hardlyworking committed (verified) · Commit 9576c91 · Parent(s): 7699583

Training in progress, step 272, checkpoint
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-136/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-272/tokenizer.json filter=lfs diff=lfs merge=lfs -text
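
The new pattern routes checkpoint-272/tokenizer.json through Git LFS, so a plain clone holds only a small pointer stub until `git lfs pull` fetches the blob. A minimal sketch (not part of the commit) for detecting that state locally:

import os

def is_lfs_pointer(path: str) -> bool:
    """Return True if `path` holds a Git LFS pointer stub rather than real data."""
    # Pointer files are a few hundred bytes; real LFS-tracked files are MBs/GBs.
    if os.path.getsize(path) > 1024:
        return False
    with open(path, errors="ignore") as f:
        first = f.readline()
    return first.startswith("version https://git-lfs.github.com/spec/")

print(is_lfs_pointer("checkpoint-272/tokenizer.json"))
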
checkpoint-272/added_tokens.json ADDED
@@ -0,0 +1,28 @@
+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}
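
These 26 entries map the tokenizer's added tokens to their IDs at the top of the vocabulary. A quick sanity check is to invert the map; a minimal sketch assuming the checkpoint directory is available locally:

import json

# Load the added-token map ({token string: token id}) and invert it,
# e.g. to confirm that 151667/151668 are the <think>/</think> pair.
with open("checkpoint-272/added_tokens.json") as f:
    added = json.load(f)

by_id = {v: k for k, v in added.items()}
print(by_id[151667], by_id[151668])  # <think> </think>
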
checkpoint-272/config.json ADDED
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
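
The attention settings imply grouped-query attention: 32 query heads share 8 key/value heads. A back-of-the-envelope check of the projection widths these numbers imply (plain arithmetic, nothing loaded):

# Projection widths implied by config.json above (standalone check).
hidden_size = 4096
num_attention_heads = 32
num_key_value_heads = 8
head_dim = 128

q_width = num_attention_heads * head_dim    # 4096 -> q_proj output width
kv_width = num_key_value_heads * head_dim   # 1024 -> k_proj/v_proj width (GQA)
group_size = num_attention_heads // num_key_value_heads
print(q_width, kv_width, group_size)        # 4096 1024 4
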
checkpoint-272/generation_config.json ADDED
@@ -0,0 +1,7 @@
+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.51.3"
+}
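
Note that the generation defaults keep eos_token_id = 151643 (<|endoftext|>), while config.json and the tokenizer use 151645 (<|im_end|>); worth double-checking before serving. A minimal sketch of loading these defaults, assuming transformers is installed and the checkpoint directory is local:

from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("checkpoint-272")
print(gen.do_sample, gen.max_new_tokens, gen.eos_token_id)  # True 2048 151643
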
checkpoint-272/merges.txt ADDED
The diff for this file is too large to render.
checkpoint-272/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ec4ca625a35e84d3503101386e6c8f8674eab84bc64dda7758e7345d63c0c02
+size 4902257696
checkpoint-272/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:571c1ee44f1247ece2f7cda9048ed869222722ff5024eb5a0b33ff7d60b5e2fb
+size 4915960368
checkpoint-272/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c48388f898f318b56263cbbf31485cf7e16df9832f043b467604b64d086c307
+size 4983068496
checkpoint-272/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e98acbab238fdb3c80917cef2ec677a9ae2c2e62aca9e4d6ae286e57044ca9b
+size 1580230264
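
Only these three-line pointers live in git; the multi-gigabyte shards themselves are fetched via LFS. A hedged sketch for verifying a downloaded shard against its pointer (the two paths are hypothetical local locations):

import hashlib

def verify_lfs_blob(pointer_path: str, blob_path: str) -> bool:
    # Pointer format: "version ...", "oid sha256:<hex>", "size <bytes>".
    with open(pointer_path) as f:
        fields = dict(line.strip().split(" ", 1) for line in f)
    expected_oid = fields["oid"].split(":", 1)[1]
    expected_size = int(fields["size"])
    digest, size = hashlib.sha256(), 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size
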
checkpoint-272/model.safetensors.index.json ADDED
@@ -0,0 +1,406 @@
+{
+  "metadata": {
+    "total_size": 16381470720
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors"
+  }
+}
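
The index maps each of the 399 tensors to one of the four shards, so a single tensor can be read without loading the full 16.4 GB. A hedged sketch, assuming the safetensors package is installed and the shards have been downloaded:

import json
from safetensors import safe_open

with open("checkpoint-272/model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.0.self_attn.q_proj.weight"
shard = index["weight_map"][name]   # -> "model-00001-of-00004.safetensors"
with safe_open(f"checkpoint-272/{shard}", framework="pt") as f:
    tensor = f.get_tensor(name)     # reads only this tensor from the shard
print(tensor.shape)                 # expected torch.Size([4096, 4096])
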
checkpoint-272/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d03a1829d4946065c5b67c9b6c8046108a7eaa659d6780663fbc091d18400be8
+size 16637925676
checkpoint-272/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a02d728a763af949adfec4bf1d121e1daf1ef655fd3b17a42c2399e0221768dd
+size 15024
checkpoint-272/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d60087aa7d44f5751e6a709354a90b705404a21a9437680f20377cf12743341b
+size 15024
checkpoint-272/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c962bfcedafeb4fe574f63e648405b9b28a01333d49c391529b2c7a4fead463
+size 15024
checkpoint-272/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:478ec022631f64178ab44a8355b560b75c942f4cf348cc636a8cb37ec3bec6cc
+size 15024
checkpoint-272/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:846b6bb7ea8557d5f0c16d1ab2e2b5d3e020e5888f4779d7eb8505ca3a9dc6ef
+size 1064
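
These files carry the optimizer and LR-scheduler state plus one RNG snapshot per rank (four here, implying a 4-process run), which is what lets `trainer.train(resume_from_checkpoint="checkpoint-272")` continue the run exactly. A small sketch peeking at one snapshot, assuming torch is installed; the key set shown is what HF Trainer typically stores, not guaranteed:

import torch

# Load one per-rank RNG snapshot on CPU. weights_only=False because the
# snapshot holds generator-state objects, not just tensors.
state = torch.load("checkpoint-272/rng_state_0.pth",
                   map_location="cpu", weights_only=False)
print(sorted(state.keys()))  # typically ['cpu', 'cuda', 'numpy', 'python']
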
checkpoint-272/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
checkpoint-272/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+size 11422654
checkpoint-272/tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {{- messages[0].content + '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n        {%- set ns.multi_step_tool = false %}\n        {%- set ns.last_query_index = index %}\n    {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set content = message.content %}\n        {%- set reasoning_content = '' %}\n        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n            {%- set reasoning_content = message.reasoning_content %}\n        {%- else %}\n            {%- if '</think>' in message.content %}\n                {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n            {%- endif %}\n        {%- endif %}\n        {%- if loop.index0 > ns.last_query_index %}\n            {%- if loop.last or (not loop.last and reasoning_content) %}\n                {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n            {%- else %}\n                {{- '<|im_start|>' + message.role + '\\n' + content }}\n            {%- endif %}\n        {%- else %}\n            {{- '<|im_start|>' + message.role + '\\n' + content }}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- '\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n{\"name\": \"' }}\n                {{- tool_call.name }}\n                {{- '\", \"arguments\": ' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '}\\n</tool_call>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\\n\\n</think>\\n\\n' }}\n    {%- endif %}\n{%- endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
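
The chat_template above is a single-line Jinja program that handles tool definitions, multi-turn tool responses, and optional <think> reasoning blocks. A hedged sketch of rendering it, assuming transformers is installed and the checkpoint directory is local (`enable_thinking` is the template's own keyword, forwarded by apply_chat_template):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-272")
messages = [{"role": "user", "content": "hello"}]
text = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # per the template: emits an empty <think>...</think> block
)
print(text)
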
checkpoint-272/trainer_state.json ADDED
@@ -0,0 +1,2074 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 17,
+ "global_step": 272,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.007352941176470588,
+ "grad_norm": 1.9140625,
+ "learning_rate": 0.0,
+ "loss": 1.6433,
+ "step": 1
+ },
+ {
+ "epoch": 0.007352941176470588,
+ "eval_loss": 2.023996114730835,
+ "eval_runtime": 29.0075,
+ "eval_samples_per_second": 1.517,
+ "eval_steps_per_second": 0.379,
+ "step": 1
+ },
+ {
+ "epoch": 0.014705882352941176,
+ "grad_norm": 1.9921875,
+ "learning_rate": 7.692307692307694e-07,
+ "loss": 1.8443,
+ "step": 2
+ },
+ {
+ "epoch": 0.022058823529411766,
+ "grad_norm": 2.015625,
+ "learning_rate": 1.5384615384615387e-06,
+ "loss": 1.7175,
+ "step": 3
+ },
+ {
+ "epoch": 0.029411764705882353,
+ "grad_norm": 1.5625,
+ "learning_rate": 2.307692307692308e-06,
+ "loss": 1.5511,
+ "step": 4
+ },
+ {
+ "epoch": 0.03676470588235294,
+ "grad_norm": 1.4375,
+ "learning_rate": 3.0769230769230774e-06,
+ "loss": 1.3348,
+ "step": 5
+ },
+ {
+ "epoch": 0.04411764705882353,
+ "grad_norm": 1.8671875,
+ "learning_rate": 3.846153846153847e-06,
+ "loss": 1.6403,
+ "step": 6
+ },
+ {
+ "epoch": 0.051470588235294115,
+ "grad_norm": 1.9375,
+ "learning_rate": 4.615384615384616e-06,
+ "loss": 2.4032,
+ "step": 7
+ },
+ {
+ "epoch": 0.058823529411764705,
+ "grad_norm": 1.734375,
+ "learning_rate": 5.384615384615385e-06,
+ "loss": 1.6092,
+ "step": 8
+ },
+ {
+ "epoch": 0.0661764705882353,
+ "grad_norm": 1.6484375,
+ "learning_rate": 6.153846153846155e-06,
+ "loss": 1.5771,
+ "step": 9
+ },
+ {
+ "epoch": 0.07352941176470588,
+ "grad_norm": 1.3515625,
+ "learning_rate": 6.923076923076923e-06,
+ "loss": 1.5274,
+ "step": 10
+ },
+ {
+ "epoch": 0.08088235294117647,
+ "grad_norm": 1.4609375,
+ "learning_rate": 7.692307692307694e-06,
+ "loss": 1.8263,
+ "step": 11
+ },
+ {
+ "epoch": 0.08823529411764706,
+ "grad_norm": 1.59375,
+ "learning_rate": 8.461538461538462e-06,
+ "loss": 1.9586,
+ "step": 12
+ },
+ {
+ "epoch": 0.09558823529411764,
+ "grad_norm": 1.4453125,
+ "learning_rate": 9.230769230769232e-06,
+ "loss": 1.9486,
+ "step": 13
+ },
+ {
+ "epoch": 0.10294117647058823,
+ "grad_norm": 1.2578125,
+ "learning_rate": 1e-05,
+ "loss": 2.0083,
+ "step": 14
+ },
+ {
+ "epoch": 0.11029411764705882,
+ "grad_norm": 1.3359375,
+ "learning_rate": 9.999632180371776e-06,
+ "loss": 1.3495,
+ "step": 15
+ },
+ {
+ "epoch": 0.11764705882352941,
+ "grad_norm": 1.3828125,
+ "learning_rate": 9.998528775603612e-06,
+ "loss": 1.45,
+ "step": 16
+ },
+ {
+ "epoch": 0.125,
+ "grad_norm": 1.515625,
+ "learning_rate": 9.996689948037081e-06,
+ "loss": 1.5696,
+ "step": 17
+ },
+ {
+ "epoch": 0.125,
+ "eval_loss": 2.0076510906219482,
+ "eval_runtime": 29.101,
+ "eval_samples_per_second": 1.512,
+ "eval_steps_per_second": 0.378,
+ "step": 17
+ },
+ {
+ "epoch": 0.1323529411764706,
+ "grad_norm": 1.296875,
+ "learning_rate": 9.994115968214933e-06,
+ "loss": 1.9919,
+ "step": 18
+ },
+ {
+ "epoch": 0.13970588235294118,
+ "grad_norm": 1.265625,
+ "learning_rate": 9.990807214841288e-06,
+ "loss": 1.1574,
+ "step": 19
+ },
+ {
+ "epoch": 0.14705882352941177,
+ "grad_norm": 1.3046875,
+ "learning_rate": 9.98676417472592e-06,
+ "loss": 1.7192,
+ "step": 20
+ },
+ {
+ "epoch": 0.15441176470588236,
+ "grad_norm": 1.3203125,
+ "learning_rate": 9.981987442712634e-06,
+ "loss": 1.42,
+ "step": 21
+ },
+ {
+ "epoch": 0.16176470588235295,
+ "grad_norm": 1.265625,
+ "learning_rate": 9.976477721591746e-06,
+ "loss": 1.903,
+ "step": 22
+ },
+ {
+ "epoch": 0.16911764705882354,
+ "grad_norm": 1.375,
+ "learning_rate": 9.97023582199669e-06,
+ "loss": 1.6847,
+ "step": 23
+ },
+ {
+ "epoch": 0.17647058823529413,
+ "grad_norm": 1.8046875,
+ "learning_rate": 9.963262662284735e-06,
+ "loss": 1.9539,
+ "step": 24
+ },
+ {
+ "epoch": 0.18382352941176472,
+ "grad_norm": 1.40625,
+ "learning_rate": 9.955559268401893e-06,
+ "loss": 1.8249,
+ "step": 25
+ },
+ {
+ "epoch": 0.19117647058823528,
+ "grad_norm": 1.1796875,
+ "learning_rate": 9.947126773731949e-06,
+ "loss": 1.5185,
+ "step": 26
+ },
+ {
+ "epoch": 0.19852941176470587,
+ "grad_norm": 1.3125,
+ "learning_rate": 9.937966418929725e-06,
+ "loss": 1.9394,
+ "step": 27
+ },
+ {
+ "epoch": 0.20588235294117646,
+ "grad_norm": 1.390625,
+ "learning_rate": 9.928079551738542e-06,
+ "loss": 1.5664,
+ "step": 28
+ },
+ {
+ "epoch": 0.21323529411764705,
+ "grad_norm": 1.1796875,
+ "learning_rate": 9.917467626791925e-06,
+ "loss": 1.5556,
+ "step": 29
+ },
+ {
+ "epoch": 0.22058823529411764,
+ "grad_norm": 1.1796875,
+ "learning_rate": 9.90613220539959e-06,
+ "loss": 1.4611,
+ "step": 30
+ },
+ {
+ "epoch": 0.22794117647058823,
+ "grad_norm": 1.078125,
+ "learning_rate": 9.89407495531773e-06,
+ "loss": 1.0783,
+ "step": 31
+ },
+ {
+ "epoch": 0.23529411764705882,
+ "grad_norm": 1.21875,
+ "learning_rate": 9.881297650503641e-06,
+ "loss": 1.8567,
+ "step": 32
+ },
+ {
+ "epoch": 0.2426470588235294,
+ "grad_norm": 1.4453125,
+ "learning_rate": 9.867802170854724e-06,
+ "loss": 1.8233,
+ "step": 33
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.875,
+ "learning_rate": 9.853590501931905e-06,
+ "loss": 2.5862,
+ "step": 34
+ },
+ {
+ "epoch": 0.25,
+ "eval_loss": 1.9994938373565674,
+ "eval_runtime": 28.2547,
+ "eval_samples_per_second": 1.557,
+ "eval_steps_per_second": 0.389,
+ "step": 34
+ },
+ {
+ "epoch": 0.25735294117647056,
+ "grad_norm": 1.296875,
+ "learning_rate": 9.838664734667496e-06,
+ "loss": 1.8706,
+ "step": 35
+ },
+ {
+ "epoch": 0.2647058823529412,
+ "grad_norm": 1.4765625,
+ "learning_rate": 9.82302706505756e-06,
+ "loss": 1.783,
+ "step": 36
+ },
+ {
+ "epoch": 0.27205882352941174,
+ "grad_norm": 1.3125,
+ "learning_rate": 9.806679793838829e-06,
+ "loss": 1.6925,
+ "step": 37
+ },
+ {
+ "epoch": 0.27941176470588236,
+ "grad_norm": 1.2109375,
+ "learning_rate": 9.78962532615019e-06,
+ "loss": 1.4931,
+ "step": 38
+ },
+ {
+ "epoch": 0.2867647058823529,
+ "grad_norm": 1.4765625,
+ "learning_rate": 9.771866171178832e-06,
+ "loss": 1.7531,
+ "step": 39
+ },
+ {
+ "epoch": 0.29411764705882354,
+ "grad_norm": 1.140625,
+ "learning_rate": 9.753404941791063e-06,
+ "loss": 1.7975,
+ "step": 40
+ },
+ {
+ "epoch": 0.3014705882352941,
+ "grad_norm": 1.09375,
+ "learning_rate": 9.734244354147897e-06,
+ "loss": 1.7083,
+ "step": 41
+ },
+ {
+ "epoch": 0.3088235294117647,
+ "grad_norm": 1.359375,
+ "learning_rate": 9.714387227305422e-06,
+ "loss": 1.6907,
+ "step": 42
+ },
+ {
+ "epoch": 0.3161764705882353,
+ "grad_norm": 1.2421875,
+ "learning_rate": 9.693836482800044e-06,
+ "loss": 1.9384,
+ "step": 43
+ },
+ {
+ "epoch": 0.3235294117647059,
+ "grad_norm": 1.3515625,
+ "learning_rate": 9.672595144218646e-06,
+ "loss": 1.8993,
+ "step": 44
+ },
+ {
+ "epoch": 0.33088235294117646,
+ "grad_norm": 1.1875,
+ "learning_rate": 9.65066633675373e-06,
+ "loss": 1.6913,
+ "step": 45
+ },
+ {
+ "epoch": 0.3382352941176471,
+ "grad_norm": 1.0390625,
+ "learning_rate": 9.628053286743619e-06,
+ "loss": 1.4722,
+ "step": 46
+ },
+ {
+ "epoch": 0.34558823529411764,
+ "grad_norm": 1.59375,
+ "learning_rate": 9.604759321197775e-06,
+ "loss": 1.7211,
+ "step": 47
+ },
+ {
+ "epoch": 0.35294117647058826,
+ "grad_norm": 1.375,
+ "learning_rate": 9.580787867307293e-06,
+ "loss": 1.9391,
+ "step": 48
+ },
+ {
+ "epoch": 0.3602941176470588,
+ "grad_norm": 1.3125,
+ "learning_rate": 9.55614245194068e-06,
+ "loss": 1.6147,
+ "step": 49
+ },
+ {
+ "epoch": 0.36764705882352944,
+ "grad_norm": 1.1484375,
+ "learning_rate": 9.53082670112494e-06,
+ "loss": 1.5916,
+ "step": 50
+ },
+ {
+ "epoch": 0.375,
+ "grad_norm": 1.375,
+ "learning_rate": 9.504844339512096e-06,
+ "loss": 1.9231,
+ "step": 51
+ },
+ {
+ "epoch": 0.375,
+ "eval_loss": 1.9944037199020386,
+ "eval_runtime": 28.9271,
+ "eval_samples_per_second": 1.521,
+ "eval_steps_per_second": 0.38,
+ "step": 51
+ },
+ {
+ "epoch": 0.38235294117647056,
+ "grad_norm": 1.109375,
+ "learning_rate": 9.478199189831184e-06,
+ "loss": 1.6755,
+ "step": 52
+ },
+ {
+ "epoch": 0.3897058823529412,
+ "grad_norm": 1.015625,
+ "learning_rate": 9.450895172325822e-06,
+ "loss": 1.6623,
+ "step": 53
+ },
+ {
+ "epoch": 0.39705882352941174,
+ "grad_norm": 1.109375,
+ "learning_rate": 9.422936304177439e-06,
+ "loss": 1.2844,
+ "step": 54
+ },
+ {
+ "epoch": 0.40441176470588236,
+ "grad_norm": 1.3203125,
+ "learning_rate": 9.394326698914229e-06,
+ "loss": 1.779,
+ "step": 55
+ },
+ {
+ "epoch": 0.4117647058823529,
+ "grad_norm": 1.21875,
+ "learning_rate": 9.365070565805941e-06,
+ "loss": 1.2299,
+ "step": 56
+ },
+ {
+ "epoch": 0.41911764705882354,
+ "grad_norm": 1.21875,
+ "learning_rate": 9.335172209244577e-06,
+ "loss": 1.9589,
+ "step": 57
+ },
+ {
+ "epoch": 0.4264705882352941,
+ "grad_norm": 1.78125,
+ "learning_rate": 9.304636028111093e-06,
+ "loss": 1.7626,
+ "step": 58
+ },
+ {
+ "epoch": 0.4338235294117647,
+ "grad_norm": 1.1875,
+ "learning_rate": 9.273466515128209e-06,
+ "loss": 1.4811,
+ "step": 59
+ },
+ {
+ "epoch": 0.4411764705882353,
+ "grad_norm": 0.90625,
+ "learning_rate": 9.241668256199392e-06,
+ "loss": 1.3207,
+ "step": 60
+ },
+ {
+ "epoch": 0.4485294117647059,
+ "grad_norm": 0.875,
+ "learning_rate": 9.209245929734156e-06,
+ "loss": 1.0984,
+ "step": 61
+ },
+ {
+ "epoch": 0.45588235294117646,
+ "grad_norm": 1.25,
+ "learning_rate": 9.176204305959727e-06,
+ "loss": 1.7514,
+ "step": 62
+ },
+ {
+ "epoch": 0.4632352941176471,
+ "grad_norm": 1.40625,
+ "learning_rate": 9.142548246219212e-06,
+ "loss": 2.0432,
+ "step": 63
+ },
+ {
+ "epoch": 0.47058823529411764,
+ "grad_norm": 3.09375,
+ "learning_rate": 9.108282702256366e-06,
+ "loss": 3.1275,
+ "step": 64
+ },
+ {
+ "epoch": 0.47794117647058826,
+ "grad_norm": 1.1796875,
+ "learning_rate": 9.073412715487045e-06,
+ "loss": 2.0289,
+ "step": 65
+ },
+ {
+ "epoch": 0.4852941176470588,
+ "grad_norm": 1.1015625,
+ "learning_rate": 9.037943416257475e-06,
+ "loss": 1.8555,
+ "step": 66
+ },
+ {
+ "epoch": 0.49264705882352944,
+ "grad_norm": 1.1953125,
+ "learning_rate": 9.001880023089442e-06,
+ "loss": 1.8256,
+ "step": 67
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 1.34375,
+ "learning_rate": 8.96522784191249e-06,
+ "loss": 1.9655,
+ "step": 68
+ },
+ {
+ "epoch": 0.5,
+ "eval_loss": 1.9907618761062622,
+ "eval_runtime": 29.101,
+ "eval_samples_per_second": 1.512,
+ "eval_steps_per_second": 0.378,
+ "step": 68
+ },
+ {
+ "epoch": 0.5073529411764706,
+ "grad_norm": 0.8515625,
+ "learning_rate": 8.927992265283282e-06,
+ "loss": 1.2591,
+ "step": 69
+ },
+ {
+ "epoch": 0.5147058823529411,
+ "grad_norm": 1.046875,
+ "learning_rate": 8.890178771592198e-06,
+ "loss": 1.644,
+ "step": 70
+ },
+ {
+ "epoch": 0.5220588235294118,
+ "grad_norm": 2.03125,
+ "learning_rate": 8.851792924257316e-06,
+ "loss": 1.6287,
+ "step": 71
+ },
+ {
+ "epoch": 0.5294117647058824,
+ "grad_norm": 1.125,
+ "learning_rate": 8.812840370905872e-06,
+ "loss": 1.5483,
+ "step": 72
+ },
+ {
+ "epoch": 0.5367647058823529,
+ "grad_norm": 1.125,
+ "learning_rate": 8.773326842543348e-06,
+ "loss": 1.664,
+ "step": 73
+ },
+ {
+ "epoch": 0.5441176470588235,
+ "grad_norm": 1.3046875,
+ "learning_rate": 8.733258152710262e-06,
+ "loss": 1.9707,
+ "step": 74
+ },
+ {
+ "epoch": 0.5514705882352942,
+ "grad_norm": 1.046875,
+ "learning_rate": 8.692640196626859e-06,
+ "loss": 1.6404,
+ "step": 75
+ },
+ {
+ "epoch": 0.5588235294117647,
+ "grad_norm": 1.1171875,
+ "learning_rate": 8.651478950325739e-06,
+ "loss": 1.5971,
+ "step": 76
+ },
+ {
+ "epoch": 0.5661764705882353,
+ "grad_norm": 1.21875,
+ "learning_rate": 8.609780469772623e-06,
+ "loss": 1.7441,
+ "step": 77
+ },
+ {
+ "epoch": 0.5735294117647058,
+ "grad_norm": 1.6171875,
+ "learning_rate": 8.567550889975362e-06,
+ "loss": 1.7133,
+ "step": 78
+ },
+ {
+ "epoch": 0.5808823529411765,
+ "grad_norm": 1.078125,
+ "learning_rate": 8.524796424081291e-06,
+ "loss": 1.3789,
+ "step": 79
+ },
+ {
+ "epoch": 0.5882352941176471,
+ "grad_norm": 1.0078125,
+ "learning_rate": 8.481523362463111e-06,
+ "loss": 1.3651,
+ "step": 80
+ },
+ {
+ "epoch": 0.5955882352941176,
+ "grad_norm": 1.1015625,
+ "learning_rate": 8.437738071793394e-06,
+ "loss": 2.0325,
+ "step": 81
+ },
+ {
+ "epoch": 0.6029411764705882,
+ "grad_norm": 1.125,
+ "learning_rate": 8.393446994107876e-06,
+ "loss": 1.6342,
+ "step": 82
+ },
+ {
+ "epoch": 0.6102941176470589,
+ "grad_norm": 1.3046875,
+ "learning_rate": 8.348656645857648e-06,
+ "loss": 1.9204,
+ "step": 83
+ },
+ {
+ "epoch": 0.6176470588235294,
+ "grad_norm": 2.0625,
+ "learning_rate": 8.303373616950408e-06,
+ "loss": 2.6573,
+ "step": 84
+ },
+ {
+ "epoch": 0.625,
+ "grad_norm": 1.0859375,
+ "learning_rate": 8.257604569780898e-06,
+ "loss": 1.6909,
+ "step": 85
+ },
+ {
+ "epoch": 0.625,
+ "eval_loss": 1.9879568815231323,
+ "eval_runtime": 28.4352,
+ "eval_samples_per_second": 1.547,
+ "eval_steps_per_second": 0.387,
+ "step": 85
+ },
+ {
+ "epoch": 0.6323529411764706,
+ "grad_norm": 1.03125,
+ "learning_rate": 8.21135623825068e-06,
+ "loss": 1.6941,
+ "step": 86
+ },
+ {
+ "epoch": 0.6397058823529411,
+ "grad_norm": 1.3046875,
+ "learning_rate": 8.164635426777404e-06,
+ "loss": 1.2534,
+ "step": 87
+ },
+ {
+ "epoch": 0.6470588235294118,
+ "grad_norm": 1.125,
+ "learning_rate": 8.117449009293668e-06,
+ "loss": 1.5544,
+ "step": 88
+ },
+ {
+ "epoch": 0.6544117647058824,
+ "grad_norm": 1.359375,
+ "learning_rate": 8.069803928235689e-06,
+ "loss": 1.2404,
+ "step": 89
+ },
+ {
+ "epoch": 0.6617647058823529,
+ "grad_norm": 1.015625,
+ "learning_rate": 8.021707193521865e-06,
+ "loss": 1.3714,
+ "step": 90
+ },
+ {
+ "epoch": 0.6691176470588235,
+ "grad_norm": 1.078125,
+ "learning_rate": 7.973165881521435e-06,
+ "loss": 1.5804,
+ "step": 91
+ },
+ {
+ "epoch": 0.6764705882352942,
+ "grad_norm": 1.3359375,
+ "learning_rate": 7.924187134013323e-06,
+ "loss": 1.627,
+ "step": 92
+ },
+ {
+ "epoch": 0.6838235294117647,
+ "grad_norm": 1.046875,
+ "learning_rate": 7.874778157135416e-06,
+ "loss": 1.3964,
+ "step": 93
+ },
+ {
+ "epoch": 0.6911764705882353,
+ "grad_norm": 1.1328125,
+ "learning_rate": 7.824946220324313e-06,
+ "loss": 1.5446,
+ "step": 94
+ },
+ {
+ "epoch": 0.6985294117647058,
+ "grad_norm": 1.4609375,
+ "learning_rate": 7.774698655245802e-06,
+ "loss": 1.7198,
+ "step": 95
+ },
+ {
+ "epoch": 0.7058823529411765,
+ "grad_norm": 0.98046875,
+ "learning_rate": 7.724042854716169e-06,
+ "loss": 1.594,
+ "step": 96
+ },
+ {
+ "epoch": 0.7132352941176471,
+ "grad_norm": 1.109375,
+ "learning_rate": 7.6729862716145e-06,
+ "loss": 1.7797,
+ "step": 97
+ },
+ {
+ "epoch": 0.7205882352941176,
+ "grad_norm": 1.3828125,
+ "learning_rate": 7.621536417786159e-06,
+ "loss": 2.3605,
+ "step": 98
+ },
+ {
+ "epoch": 0.7279411764705882,
+ "grad_norm": 0.9765625,
+ "learning_rate": 7.56970086293759e-06,
+ "loss": 1.5639,
+ "step": 99
+ },
+ {
+ "epoch": 0.7352941176470589,
+ "grad_norm": 1.3203125,
+ "learning_rate": 7.5174872335226e-06,
+ "loss": 1.9192,
+ "step": 100
+ },
+ {
+ "epoch": 0.7426470588235294,
+ "grad_norm": 0.9765625,
+ "learning_rate": 7.464903211620291e-06,
+ "loss": 1.7064,
+ "step": 101
+ },
+ {
+ "epoch": 0.75,
+ "grad_norm": 1.265625,
+ "learning_rate": 7.4119565338048195e-06,
+ "loss": 1.8634,
+ "step": 102
+ },
+ {
+ "epoch": 0.75,
+ "eval_loss": 1.9857271909713745,
+ "eval_runtime": 28.8833,
+ "eval_samples_per_second": 1.523,
+ "eval_steps_per_second": 0.381,
+ "step": 102
+ },
+ {
+ "epoch": 0.7573529411764706,
+ "grad_norm": 1.0703125,
+ "learning_rate": 7.358654990007123e-06,
+ "loss": 1.6888,
+ "step": 103
+ },
+ {
+ "epoch": 0.7647058823529411,
+ "grad_norm": 1.1640625,
+ "learning_rate": 7.305006422368811e-06,
+ "loss": 1.5068,
+ "step": 104
+ },
+ {
+ "epoch": 0.7720588235294118,
+ "grad_norm": 0.8984375,
+ "learning_rate": 7.251018724088367e-06,
+ "loss": 1.2948,
+ "step": 105
+ },
+ {
+ "epoch": 0.7794117647058824,
+ "grad_norm": 1.46875,
+ "learning_rate": 7.196699838259834e-06,
+ "loss": 1.7578,
+ "step": 106
+ },
+ {
+ "epoch": 0.7867647058823529,
+ "grad_norm": 1.171875,
+ "learning_rate": 7.142057756704168e-06,
+ "loss": 1.8871,
+ "step": 107
+ },
+ {
+ "epoch": 0.7941176470588235,
+ "grad_norm": 1.046875,
+ "learning_rate": 7.087100518793421e-06,
+ "loss": 1.6245,
+ "step": 108
+ },
+ {
+ "epoch": 0.8014705882352942,
+ "grad_norm": 1.09375,
+ "learning_rate": 7.031836210267915e-06,
+ "loss": 1.9928,
+ "step": 109
+ },
+ {
+ "epoch": 0.8088235294117647,
+ "grad_norm": 1.03125,
+ "learning_rate": 6.976272962046619e-06,
+ "loss": 1.6471,
+ "step": 110
+ },
+ {
+ "epoch": 0.8161764705882353,
+ "grad_norm": 1.171875,
+ "learning_rate": 6.920418949030856e-06,
+ "loss": 2.0028,
+ "step": 111
+ },
+ {
+ "epoch": 0.8235294117647058,
+ "grad_norm": 1.0234375,
+ "learning_rate": 6.864282388901544e-06,
+ "loss": 1.3726,
+ "step": 112
+ },
+ {
+ "epoch": 0.8308823529411765,
+ "grad_norm": 1.015625,
+ "learning_rate": 6.807871540910155e-06,
+ "loss": 1.7367,
+ "step": 113
+ },
+ {
+ "epoch": 0.8382352941176471,
+ "grad_norm": 1.0703125,
+ "learning_rate": 6.751194704663544e-06,
+ "loss": 1.6541,
+ "step": 114
+ },
+ {
+ "epoch": 0.8455882352941176,
+ "grad_norm": 1.03125,
+ "learning_rate": 6.694260218902845e-06,
+ "loss": 1.8604,
+ "step": 115
+ },
+ {
+ "epoch": 0.8529411764705882,
+ "grad_norm": 1.21875,
+ "learning_rate": 6.637076460276612e-06,
+ "loss": 1.8829,
+ "step": 116
+ },
+ {
+ "epoch": 0.8602941176470589,
+ "grad_norm": 1.0390625,
+ "learning_rate": 6.579651842108381e-06,
+ "loss": 1.692,
+ "step": 117
+ },
+ {
+ "epoch": 0.8676470588235294,
+ "grad_norm": 1.2734375,
+ "learning_rate": 6.521994813158834e-06,
+ "loss": 2.1307,
+ "step": 118
+ },
+ {
+ "epoch": 0.875,
+ "grad_norm": 1.140625,
+ "learning_rate": 6.464113856382752e-06,
+ "loss": 1.6684,
+ "step": 119
+ },
+ {
+ "epoch": 0.875,
+ "eval_loss": 1.9832121133804321,
+ "eval_runtime": 28.1006,
+ "eval_samples_per_second": 1.566,
+ "eval_steps_per_second": 0.391,
+ "step": 119
+ },
+ {
+ "epoch": 0.8823529411764706,
+ "grad_norm": 1.1015625,
+ "learning_rate": 6.406017487680938e-06,
+ "loss": 1.8429,
+ "step": 120
+ },
+ {
+ "epoch": 0.8897058823529411,
+ "grad_norm": 1.015625,
+ "learning_rate": 6.3477142546472836e-06,
+ "loss": 1.748,
+ "step": 121
+ },
+ {
+ "epoch": 0.8970588235294118,
+ "grad_norm": 1.1875,
+ "learning_rate": 6.28921273531119e-06,
+ "loss": 1.4421,
+ "step": 122
+ },
+ {
+ "epoch": 0.9044117647058824,
+ "grad_norm": 1.5234375,
+ "learning_rate": 6.230521536875494e-06,
+ "loss": 1.8182,
+ "step": 123
+ },
+ {
+ "epoch": 0.9117647058823529,
+ "grad_norm": 1.2734375,
+ "learning_rate": 6.171649294450113e-06,
+ "loss": 1.9269,
+ "step": 124
+ },
+ {
+ "epoch": 0.9191176470588235,
+ "grad_norm": 1.234375,
+ "learning_rate": 6.112604669781572e-06,
+ "loss": 1.6172,
+ "step": 125
+ },
+ {
+ "epoch": 0.9264705882352942,
+ "grad_norm": 1.3046875,
+ "learning_rate": 6.053396349978632e-06,
+ "loss": 1.7184,
+ "step": 126
+ },
+ {
+ "epoch": 0.9338235294117647,
+ "grad_norm": 1.0234375,
+ "learning_rate": 5.994033046234163e-06,
+ "loss": 1.3447,
+ "step": 127
+ },
+ {
+ "epoch": 0.9411764705882353,
+ "grad_norm": 0.890625,
+ "learning_rate": 5.934523492543489e-06,
+ "loss": 1.158,
+ "step": 128
+ },
+ {
+ "epoch": 0.9485294117647058,
+ "grad_norm": 1.21875,
+ "learning_rate": 5.874876444419377e-06,
+ "loss": 2.0378,
+ "step": 129
+ },
+ {
+ "epoch": 0.9558823529411765,
+ "grad_norm": 1.0703125,
+ "learning_rate": 5.815100677603854e-06,
+ "loss": 1.7901,
+ "step": 130
+ },
+ {
+ "epoch": 0.9632352941176471,
+ "grad_norm": 1.15625,
+ "learning_rate": 5.75520498677705e-06,
+ "loss": 1.6435,
+ "step": 131
+ },
+ {
+ "epoch": 0.9705882352941176,
+ "grad_norm": 1.1484375,
+ "learning_rate": 5.695198184263259e-06,
+ "loss": 1.8977,
+ "step": 132
+ },
+ {
+ "epoch": 0.9779411764705882,
+ "grad_norm": 1.046875,
+ "learning_rate": 5.635089098734394e-06,
+ "loss": 1.4106,
+ "step": 133
+ },
+ {
+ "epoch": 0.9852941176470589,
+ "grad_norm": 1.234375,
+ "learning_rate": 5.574886573911056e-06,
+ "loss": 1.8682,
+ "step": 134
+ },
+ {
+ "epoch": 0.9926470588235294,
+ "grad_norm": 0.90234375,
+ "learning_rate": 5.514599467261363e-06,
+ "loss": 0.9243,
+ "step": 135
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 2.578125,
+ "learning_rate": 5.454236648697776e-06,
+ "loss": 1.8727,
+ "step": 136
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 1.981597661972046,
+ "eval_runtime": 29.087,
+ "eval_samples_per_second": 1.513,
+ "eval_steps_per_second": 0.378,
+ "step": 136
+ },
+ {
+ "epoch": 1.0073529411764706,
+ "grad_norm": 0.99609375,
+ "learning_rate": 5.3938069992720894e-06,
+ "loss": 1.6062,
+ "step": 137
+ },
+ {
+ "epoch": 1.0147058823529411,
+ "grad_norm": 1.1171875,
+ "learning_rate": 5.333319409868777e-06,
+ "loss": 1.8066,
+ "step": 138
+ },
+ {
+ "epoch": 1.0220588235294117,
+ "grad_norm": 1.1953125,
+ "learning_rate": 5.272782779896898e-06,
+ "loss": 1.6509,
+ "step": 139
+ },
+ {
+ "epoch": 1.0294117647058822,
+ "grad_norm": 0.98046875,
+ "learning_rate": 5.212206015980742e-06,
+ "loss": 1.5008,
+ "step": 140
+ },
+ {
+ "epoch": 1.036764705882353,
+ "grad_norm": 0.98828125,
+ "learning_rate": 5.151598030649425e-06,
+ "loss": 1.3041,
+ "step": 141
+ },
+ {
+ "epoch": 1.0441176470588236,
+ "grad_norm": 1.2109375,
+ "learning_rate": 5.090967741025599e-06,
+ "loss": 1.5998,
+ "step": 142
+ },
+ {
+ "epoch": 1.0514705882352942,
+ "grad_norm": 1.9375,
+ "learning_rate": 5.030324067513499e-06,
+ "loss": 2.3704,
+ "step": 143
+ },
+ {
+ "epoch": 1.0588235294117647,
+ "grad_norm": 1.0234375,
+ "learning_rate": 4.969675932486503e-06,
+ "loss": 1.5675,
+ "step": 144
+ },
+ {
+ "epoch": 1.0661764705882353,
+ "grad_norm": 1.0390625,
+ "learning_rate": 4.909032258974403e-06,
+ "loss": 1.5082,
+ "step": 145
+ },
+ {
+ "epoch": 1.0735294117647058,
+ "grad_norm": 0.9765625,
+ "learning_rate": 4.848401969350577e-06,
+ "loss": 1.4901,
+ "step": 146
+ },
+ {
+ "epoch": 1.0808823529411764,
+ "grad_norm": 1.0234375,
+ "learning_rate": 4.78779398401926e-06,
+ "loss": 1.7717,
+ "step": 147
+ },
+ {
+ "epoch": 1.088235294117647,
+ "grad_norm": 1.0234375,
+ "learning_rate": 4.7272172201031055e-06,
+ "loss": 1.8931,
+ "step": 148
+ },
+ {
+ "epoch": 1.0955882352941178,
+ "grad_norm": 1.125,
+ "learning_rate": 4.666680590131225e-06,
+ "loss": 1.916,
+ "step": 149
+ },
+ {
+ "epoch": 1.1029411764705883,
+ "grad_norm": 1.03125,
+ "learning_rate": 4.606193000727913e-06,
+ "loss": 1.9528,
+ "step": 150
+ },
+ {
+ "epoch": 1.1102941176470589,
+ "grad_norm": 1.0,
+ "learning_rate": 4.545763351302224e-06,
+ "loss": 1.3164,
+ "step": 151
+ },
+ {
+ "epoch": 1.1176470588235294,
+ "grad_norm": 1.4453125,
+ "learning_rate": 4.485400532738638e-06,
+ "loss": 1.4166,
+ "step": 152
+ },
+ {
+ "epoch": 1.125,
+ "grad_norm": 1.1328125,
+ "learning_rate": 4.425113426088945e-06,
+ "loss": 1.542,
+ "step": 153
+ },
+ {
+ "epoch": 1.125,
+ "eval_loss": 1.9806106090545654,
+ "eval_runtime": 28.4601,
+ "eval_samples_per_second": 1.546,
+ "eval_steps_per_second": 0.387,
+ "step": 153
+ },
+ {
+ "epoch": 1.1323529411764706,
+ "grad_norm": 1.046875,
+ "learning_rate": 4.364910901265607e-06,
+ "loss": 1.9457,
+ "step": 154
+ },
+ {
+ "epoch": 1.1397058823529411,
+ "grad_norm": 0.91015625,
+ "learning_rate": 4.3048018157367435e-06,
+ "loss": 1.1295,
+ "step": 155
+ },
+ {
+ "epoch": 1.1470588235294117,
+ "grad_norm": 1.0390625,
+ "learning_rate": 4.244795013222951e-06,
+ "loss": 1.6723,
+ "step": 156
+ },
+ {
+ "epoch": 1.1544117647058822,
+ "grad_norm": 1.0546875,
+ "learning_rate": 4.184899322396147e-06,
+ "loss": 1.3933,
+ "step": 157
+ },
+ {
+ "epoch": 1.161764705882353,
+ "grad_norm": 1.0234375,
+ "learning_rate": 4.125123555580624e-06,
+ "loss": 1.8577,
+ "step": 158
+ },
+ {
+ "epoch": 1.1691176470588236,
+ "grad_norm": 1.0703125,
+ "learning_rate": 4.0654765074565125e-06,
+ "loss": 1.6555,
+ "step": 159
+ },
+ {
+ "epoch": 1.1764705882352942,
+ "grad_norm": 1.5390625,
+ "learning_rate": 4.00596695376584e-06,
+ "loss": 1.9155,
+ "step": 160
+ },
+ {
+ "epoch": 1.1838235294117647,
+ "grad_norm": 1.2109375,
+ "learning_rate": 3.94660365002137e-06,
+ "loss": 1.7921,
+ "step": 161
+ },
+ {
+ "epoch": 1.1911764705882353,
+ "grad_norm": 1.03125,
+ "learning_rate": 3.887395330218429e-06,
+ "loss": 1.492,
+ "step": 162
+ },
+ {
+ "epoch": 1.1985294117647058,
+ "grad_norm": 1.1875,
+ "learning_rate": 3.8283507055498886e-06,
+ "loss": 1.9106,
+ "step": 163
+ },
+ {
+ "epoch": 1.2058823529411764,
+ "grad_norm": 1.0859375,
+ "learning_rate": 3.7694784631245066e-06,
+ "loss": 1.539,
+ "step": 164
+ },
+ {
+ "epoch": 1.213235294117647,
+ "grad_norm": 1.0390625,
+ "learning_rate": 3.7107872646888115e-06,
+ "loss": 1.5264,
+ "step": 165
+ },
+ {
+ "epoch": 1.2205882352941178,
+ "grad_norm": 1.0703125,
+ "learning_rate": 3.6522857453527172e-06,
+ "loss": 1.4314,
+ "step": 166
+ },
+ {
+ "epoch": 1.2279411764705883,
+ "grad_norm": 0.88671875,
+ "learning_rate": 3.5939825123190637e-06,
+ "loss": 1.0555,
+ "step": 167
+ },
+ {
+ "epoch": 1.2352941176470589,
+ "grad_norm": 1.0546875,
+ "learning_rate": 3.5358861436172487e-06,
+ "loss": 1.8202,
+ "step": 168
+ },
+ {
+ "epoch": 1.2426470588235294,
+ "grad_norm": 1.453125,
+ "learning_rate": 3.478005186841167e-06,
+ "loss": 1.777,
+ "step": 169
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 1.953125,
+ "learning_rate": 3.4203481578916197e-06,
+ "loss": 2.5733,
+ "step": 170
+ },
+ {
+ "epoch": 1.25,
+ "eval_loss": 1.9800699949264526,
+ "eval_runtime": 28.6845,
+ "eval_samples_per_second": 1.534,
+ "eval_steps_per_second": 0.383,
+ "step": 170
+ },
+ {
+ "epoch": 1.2573529411764706,
+ "grad_norm": 1.0859375,
+ "learning_rate": 3.3629235397233894e-06,
+ "loss": 1.8338,
+ "step": 171
+ },
+ {
+ "epoch": 1.2647058823529411,
+ "grad_norm": 1.1796875,
+ "learning_rate": 3.305739781097157e-06,
+ "loss": 1.7603,
+ "step": 172
+ },
+ {
+ "epoch": 1.2720588235294117,
+ "grad_norm": 1.0859375,
+ "learning_rate": 3.248805295336458e-06,
+ "loss": 1.6503,
+ "step": 173
+ },
+ {
+ "epoch": 1.2794117647058822,
+ "grad_norm": 1.0390625,
+ "learning_rate": 3.192128459089846e-06,
+ "loss": 1.4681,
+ "step": 174
+ },
+ {
+ "epoch": 1.2867647058823528,
+ "grad_norm": 1.1640625,
+ "learning_rate": 3.1357176110984578e-06,
+ "loss": 1.7137,
+ "step": 175
+ },
+ {
+ "epoch": 1.2941176470588236,
+ "grad_norm": 1.0078125,
+ "learning_rate": 3.079581050969146e-06,
+ "loss": 1.7776,
+ "step": 176
+ },
+ {
+ "epoch": 1.3014705882352942,
+ "grad_norm": 0.96484375,
+ "learning_rate": 3.0237270379533823e-06,
+ "loss": 1.6797,
+ "step": 177
+ },
+ {
+ "epoch": 1.3088235294117647,
+ "grad_norm": 1.1015625,
+ "learning_rate": 2.968163789732087e-06,
+ "loss": 1.6639,
+ "step": 178
+ },
+ {
+ "epoch": 1.3161764705882353,
+ "grad_norm": 1.1796875,
+ "learning_rate": 2.912899481206582e-06,
+ "loss": 1.9166,
+ "step": 179
+ },
+ {
+ "epoch": 1.3235294117647058,
+ "grad_norm": 1.3203125,
+ "learning_rate": 2.8579422432958316e-06,
+ "loss": 1.8786,
+ "step": 180
+ },
+ {
+ "epoch": 1.3308823529411764,
+ "grad_norm": 1.09375,
+ "learning_rate": 2.803300161740166e-06,
+ "loss": 1.6681,
+ "step": 181
+ },
+ {
+ "epoch": 1.3382352941176472,
+ "grad_norm": 0.93359375,
+ "learning_rate": 2.748981275911633e-06,
+ "loss": 1.4445,
+ "step": 182
+ },
+ {
+ "epoch": 1.3455882352941178,
+ "grad_norm": 1.34375,
+ "learning_rate": 2.69499357763119e-06,
+ "loss": 1.6979,
+ "step": 183
+ },
+ {
+ "epoch": 1.3529411764705883,
+ "grad_norm": 1.2890625,
+ "learning_rate": 2.641345009992878e-06,
+ "loss": 1.9119,
+ "step": 184
+ },
+ {
+ "epoch": 1.3602941176470589,
+ "grad_norm": 1.09375,
+ "learning_rate": 2.5880434661951826e-06,
+ "loss": 1.5965,
+ "step": 185
+ },
+ {
+ "epoch": 1.3676470588235294,
+ "grad_norm": 0.9765625,
+ "learning_rate": 2.5350967883797095e-06,
+ "loss": 1.5683,
+ "step": 186
+ },
+ {
+ "epoch": 1.375,
+ "grad_norm": 1.1015625,
+ "learning_rate": 2.4825127664774008e-06,
+ "loss": 1.8934,
+ "step": 187
+ },
+ {
+ "epoch": 1.375,
+ "eval_loss": 1.9797371625900269,
+ "eval_runtime": 29.165,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.377,
+ "step": 187
+ },
+ {
+ "epoch": 1.3823529411764706,
+ "grad_norm": 1.0546875,
+ "learning_rate": 2.4302991370624106e-06,
+ "loss": 1.6543,
+ "step": 188
+ },
+ {
+ "epoch": 1.3897058823529411,
+ "grad_norm": 0.9921875,
+ "learning_rate": 2.3784635822138424e-06,
+ "loss": 1.6389,
+ "step": 189
+ },
+ {
+ "epoch": 1.3970588235294117,
+ "grad_norm": 1.03125,
+ "learning_rate": 2.3270137283855022e-06,
+ "loss": 1.2686,
+ "step": 190
+ },
+ {
+ "epoch": 1.4044117647058822,
+ "grad_norm": 1.1015625,
+ "learning_rate": 2.2759571452838325e-06,
+ "loss": 1.7589,
+ "step": 191
+ },
+ {
+ "epoch": 1.4117647058823528,
+ "grad_norm": 0.99609375,
+ "learning_rate": 2.2253013447541993e-06,
+ "loss": 1.2093,
+ "step": 192
+ },
+ {
+ "epoch": 1.4191176470588236,
+ "grad_norm": 1.0703125,
+ "learning_rate": 2.17505377967569e-06,
+ "loss": 1.9398,
+ "step": 193
+ },
+ {
+ "epoch": 1.4264705882352942,
+ "grad_norm": 1.3671875,
+ "learning_rate": 2.125221842864585e-06,
+ "loss": 1.7399,
+ "step": 194
+ },
+ {
+ "epoch": 1.4338235294117647,
+ "grad_norm": 1.03125,
+ "learning_rate": 2.075812865986677e-06,
+ "loss": 1.4623,
+ "step": 195
+ },
+ {
+ "epoch": 1.4411764705882353,
+ "grad_norm": 0.87109375,
+ "learning_rate": 2.0268341184785674e-06,
+ "loss": 1.303,
+ "step": 196
+ },
+ {
+ "epoch": 1.4485294117647058,
+ "grad_norm": 0.76953125,
+ "learning_rate": 1.978292806478134e-06,
+ "loss": 1.0736,
+ "step": 197
+ },
+ {
+ "epoch": 1.4558823529411764,
+ "grad_norm": 1.140625,
+ "learning_rate": 1.930196071764312e-06,
+ "loss": 1.7327,
+ "step": 198
+ },
+ {
+ "epoch": 1.4632352941176472,
+ "grad_norm": 1.296875,
+ "learning_rate": 1.8825509907063328e-06,
+ "loss": 2.01,
+ "step": 199
+ },
+ {
+ "epoch": 1.4705882352941178,
+ "grad_norm": 2.65625,
+ "learning_rate": 1.8353645732225977e-06,
+ "loss": 3.1163,
+ "step": 200
+ },
+ {
+ "epoch": 1.4779411764705883,
+ "grad_norm": 1.0625,
+ "learning_rate": 1.7886437617493206e-06,
+ "loss": 1.9975,
+ "step": 201
+ },
+ {
+ "epoch": 1.4852941176470589,
+ "grad_norm": 1.0390625,
+ "learning_rate": 1.7423954302191047e-06,
+ "loss": 1.8281,
+ "step": 202
+ },
+ {
+ "epoch": 1.4926470588235294,
+ "grad_norm": 0.98828125,
+ "learning_rate": 1.6966263830495939e-06,
+ "loss": 1.8038,
+ "step": 203
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 1.125,
+ "learning_rate": 1.6513433541423529e-06,
+ "loss": 1.9386,
+ "step": 204
+ },
+ {
+ "epoch": 1.5,
+ "eval_loss": 1.9795974493026733,
+ "eval_runtime": 27.8777,
+ "eval_samples_per_second": 1.578,
+ "eval_steps_per_second": 0.395,
+ "step": 204
+ },
+ {
+ "epoch": 1.5073529411764706,
+ "grad_norm": 0.78515625,
+ "learning_rate": 1.6065530058921253e-06,
+ "loss": 1.2361,
+ "step": 205
+ },
+ {
+ "epoch": 1.5147058823529411,
+ "grad_norm": 1.0546875,
+ "learning_rate": 1.562261928206608e-06,
+ "loss": 1.6261,
+ "step": 206
+ },
+ {
+ "epoch": 1.5220588235294117,
+ "grad_norm": 1.078125,
+ "learning_rate": 1.5184766375368914e-06,
+ "loss": 1.6148,
+ "step": 207
+ },
+ {
+ "epoch": 1.5294117647058822,
+ "grad_norm": 1.0859375,
+ "learning_rate": 1.4752035759187106e-06,
+ "loss": 1.5313,
+ "step": 208
+ },
+ {
+ "epoch": 1.5367647058823528,
+ "grad_norm": 1.078125,
+ "learning_rate": 1.4324491100246386e-06,
+ "loss": 1.6464,
+ "step": 209
+ },
+ {
+ "epoch": 1.5441176470588234,
+ "grad_norm": 1.234375,
+ "learning_rate": 1.390219530227378e-06,
+ "loss": 1.9547,
+ "step": 210
+ },
+ {
+ "epoch": 1.5514705882352942,
+ "grad_norm": 0.95703125,
+ "learning_rate": 1.348521049674264e-06,
+ "loss": 1.6224,
+ "step": 211
+ },
+ {
+ "epoch": 1.5588235294117647,
+ "grad_norm": 1.015625,
+ "learning_rate": 1.3073598033731427e-06,
+ "loss": 1.5813,
+ "step": 212
+ },
+ {
+ "epoch": 1.5661764705882353,
+ "grad_norm": 1.09375,
+ "learning_rate": 1.2667418472897386e-06,
+ "loss": 1.7288,
+ "step": 213
+ },
+ {
+ "epoch": 1.5735294117647058,
+ "grad_norm": 1.2578125,
+ "learning_rate": 1.2266731574566536e-06,
+ "loss": 1.6956,
+ "step": 214
+ },
+ {
+ "epoch": 1.5808823529411766,
+ "grad_norm": 0.921875,
+ "learning_rate": 1.1871596290941278e-06,
+ "loss": 1.3637,
+ "step": 215
+ },
+ {
+ "epoch": 1.5882352941176472,
+ "grad_norm": 0.96484375,
+ "learning_rate": 1.1482070757426855e-06,
+ "loss": 1.3508,
+ "step": 216
+ },
+ {
+ "epoch": 1.5955882352941178,
+ "grad_norm": 0.98828125,
+ "learning_rate": 1.1098212284078037e-06,
+ "loss": 2.0166,
+ "step": 217
+ },
+ {
+ "epoch": 1.6029411764705883,
+ "grad_norm": 1.0390625,
+ "learning_rate": 1.07200773471672e-06,
+ "loss": 1.6187,
+ "step": 218
+ },
+ {
+ "epoch": 1.6102941176470589,
+ "grad_norm": 1.2109375,
+ "learning_rate": 1.0347721580875125e-06,
+ "loss": 1.9056,
+ "step": 219
+ },
+ {
+ "epoch": 1.6176470588235294,
+ "grad_norm": 1.75,
+ "learning_rate": 9.981199769105605e-07,
+ "loss": 2.6408,
+ "step": 220
+ },
+ {
+ "epoch": 1.625,
+ "grad_norm": 1.0546875,
+ "learning_rate": 9.62056583742527e-07,
+ "loss": 1.6764,
+ "step": 221
+ },
+ {
+ "epoch": 1.625,
+ "eval_loss": 1.9795591831207275,
+ "eval_runtime": 29.1758,
+ "eval_samples_per_second": 1.508,
+ "eval_steps_per_second": 0.377,
+ "step": 221
+ },
+ {
+ "epoch": 1.6323529411764706,
+ "grad_norm": 1.2734375,
+ "learning_rate": 9.26587284512957e-07,
+ "loss": 1.6724,
+ "step": 222
+ },
+ {
+ "epoch": 1.6397058823529411,
+ "grad_norm": 1.1328125,
+ "learning_rate": 8.917172977436356e-07,
+ "loss": 1.2422,
+ "step": 223
+ },
+ {
+ "epoch": 1.6470588235294117,
+ "grad_norm": 1.1953125,
+ "learning_rate": 8.574517537807897e-07,
+ "loss": 1.543,
+ "step": 224
+ },
+ {
+ "epoch": 1.6544117647058822,
+ "grad_norm": 1.0703125,
+ "learning_rate": 8.237956940402758e-07,
+ "loss": 1.2271,
+ "step": 225
+ },
+ {
+ "epoch": 1.6617647058823528,
+ "grad_norm": 0.94921875,
+ "learning_rate": 7.907540702658456e-07,
+ "loss": 1.3496,
+ "step": 226
+ },
+ {
+ "epoch": 1.6691176470588234,
+ "grad_norm": 1.0234375,
+ "learning_rate": 7.583317438006094e-07,
+ "loss": 1.565,
+ "step": 227
+ },
+ {
+ "epoch": 1.6764705882352942,
+ "grad_norm": 1.40625,
+ "learning_rate": 7.265334848717931e-07,
+ "loss": 1.6159,
+ "step": 228
+ },
+ {
+ "epoch": 1.6838235294117647,
+ "grad_norm": 0.89453125,
+ "learning_rate": 6.953639718889077e-07,
+ "loss": 1.3744,
+ "step": 229
+ },
+ {
+ "epoch": 1.6911764705882353,
+ "grad_norm": 1.0625,
+ "learning_rate": 6.648277907554235e-07,
+ "loss": 1.5341,
+ "step": 230
+ },
+ {
+ "epoch": 1.6985294117647058,
+ "grad_norm": 1.0234375,
+ "learning_rate": 6.349294341940593e-07,
+ "loss": 1.7079,
+ "step": 231
+ },
+ {
+ "epoch": 1.7058823529411766,
+ "grad_norm": 0.9375,
+ "learning_rate": 6.056733010857713e-07,
+ "loss": 1.5848,
+ "step": 232
+ },
+ {
+ "epoch": 1.7132352941176472,
+ "grad_norm": 1.0859375,
+ "learning_rate": 5.770636958225617e-07,
+ "loss": 1.7697,
+ "step": 233
+ },
+ {
+ "epoch": 1.7205882352941178,
+ "grad_norm": 1.4765625,
+ "learning_rate": 5.491048276741784e-07,
+ "loss": 2.3449,
+ "step": 234
+ },
+ {
+ "epoch": 1.7279411764705883,
+ "grad_norm": 0.95703125,
+ "learning_rate": 5.218008101688172e-07,
+ "loss": 1.5532,
+ "step": 235
+ },
+ {
+ "epoch": 1.7352941176470589,
+ "grad_norm": 1.2265625,
+ "learning_rate": 4.951556604879049e-07,
+ "loss": 1.9012,
+ "step": 236
+ },
+ {
+ "epoch": 1.7426470588235294,
+ "grad_norm": 0.91796875,
+ "learning_rate": 4.6917329887506133e-07,
+ "loss": 1.6908,
+ "step": 237
+ },
+ {
+ "epoch": 1.75,
+ "grad_norm": 1.1640625,
+ "learning_rate": 4.43857548059321e-07,
+ "loss": 1.8524,
+ "step": 238
+ },
+ {
+ "epoch": 1.75,
+ "eval_loss": 1.9795602560043335,
+ "eval_runtime": 28.9797,
+ "eval_samples_per_second": 1.518,
+ "eval_steps_per_second": 0.38,
+ "step": 238
+ },
+ {
+ "epoch": 1.7573529411764706,
+ "grad_norm": 1.0234375,
+ "learning_rate": 4.192121326927073e-07,
+ "loss": 1.681,
+ "step": 239
+ },
+ {
+ "epoch": 1.7647058823529411,
+ "grad_norm": 1.09375,
+ "learning_rate": 3.952406788022267e-07,
+ "loss": 1.4955,
+ "step": 240
+ },
+ {
+ "epoch": 1.7720588235294117,
+ "grad_norm": 0.87109375,
+ "learning_rate": 3.71946713256382e-07,
+ "loss": 1.2862,
+ "step": 241
+ },
+ {
+ "epoch": 1.7794117647058822,
+ "grad_norm": 1.1796875,
+ "learning_rate": 3.4933366324627183e-07,
+ "loss": 1.7497,
+ "step": 242
+ },
+ {
+ "epoch": 1.7867647058823528,
+ "grad_norm": 1.078125,
+ "learning_rate": 3.274048557813553e-07,
+ "loss": 1.8767,
+ "step": 243
+ },
+ {
+ "epoch": 1.7941176470588234,
+ "grad_norm": 1.015625,
+ "learning_rate": 3.061635171999566e-07,
+ "loss": 1.6158,
+ "step": 244
+ },
+ {
+ "epoch": 1.8014705882352942,
+ "grad_norm": 1.03125,
+ "learning_rate": 2.85612772694579e-07,
+ "loss": 1.9836,
+ "step": 245
+ },
+ {
+ "epoch": 1.8088235294117647,
+ "grad_norm": 0.953125,
+ "learning_rate": 2.6575564585210487e-07,
+ "loss": 1.6339,
+ "step": 246
+ },
+ {
+ "epoch": 1.8161764705882353,
+ "grad_norm": 1.1640625,
+ "learning_rate": 2.4659505820893827e-07,
+ "loss": 1.9937,
+ "step": 247
+ },
+ {
+ "epoch": 1.8235294117647058,
+ "grad_norm": 0.9609375,
+ "learning_rate": 2.2813382882116986e-07,
+ "loss": 1.3654,
+ "step": 248
+ },
+ {
+ "epoch": 1.8308823529411766,
+ "grad_norm": 0.99609375,
+ "learning_rate": 2.1037467384981024e-07,
+ "loss": 1.7286,
+ "step": 249
+ },
+ {
+ "epoch": 1.8382352941176472,
+ "grad_norm": 1.34375,
+ "learning_rate": 1.933202061611722e-07,
+ "loss": 1.6418,
+ "step": 250
+ },
+ {
+ "epoch": 1.8455882352941178,
+ "grad_norm": 0.94921875,
+ "learning_rate": 1.769729349424415e-07,
+ "loss": 1.8501,
+ "step": 251
+ },
+ {
+ "epoch": 1.8529411764705883,
+ "grad_norm": 1.15625,
+ "learning_rate": 1.6133526533250566e-07,
+ "loss": 1.876,
+ "step": 252
+ },
+ {
+ "epoch": 1.8602941176470589,
+ "grad_norm": 0.9921875,
+ "learning_rate": 1.4640949806809523e-07,
+ "loss": 1.6815,
+ "step": 253
+ },
+ {
+ "epoch": 1.8676470588235294,
+ "grad_norm": 1.1953125,
+ "learning_rate": 1.3219782914527633e-07,
+ "loss": 2.1211,
+ "step": 254
+ },
+ {
+ "epoch": 1.875,
+ "grad_norm": 1.1171875,
+ "learning_rate": 1.1870234949636072e-07,
+ "loss": 1.661,
+ "step": 255
+ },
+ {
+ "epoch": 1.875,
+ "eval_loss": 1.979546308517456,
+ "eval_runtime": 28.9374,
+ "eval_samples_per_second": 1.521,
+ "eval_steps_per_second": 0.38,
+ "step": 255
+ },
+ {
+ "epoch": 1.8823529411764706,
+ "grad_norm": 1.0625,
+ "learning_rate": 1.0592504468227127e-07,
+ "loss": 1.8337,
+ "step": 256
+ },
+ {
+ "epoch": 1.8897058823529411,
+ "grad_norm": 1.015625,
+ "learning_rate": 9.386779460041018e-08,
+ "loss": 1.7393,
+ "step": 257
+ },
+ {
+ "epoch": 1.8970588235294117,
+ "grad_norm": 1.1015625,
+ "learning_rate": 8.253237320807461e-08,
+ "loss": 1.4356,
+ "step": 258
+ },
+ {
+ "epoch": 1.9044117647058822,
+ "grad_norm": 1.328125,
+ "learning_rate": 7.192044826145772e-08,
+ "loss": 1.8109,
+ "step": 259
+ },
+ {
+ "epoch": 1.9117647058823528,
+ "grad_norm": 1.234375,
+ "learning_rate": 6.203358107027491e-08,
+ "loss": 1.9221,
+ "step": 260
+ },
+ {
+ "epoch": 1.9191176470588234,
+ "grad_norm": 1.1796875,
+ "learning_rate": 5.2873226268052026e-08,
+ "loss": 1.611,
+ "step": 261
+ },
+ {
+ "epoch": 1.9264705882352942,
+ "grad_norm": 1.2421875,
+ "learning_rate": 4.444073159810769e-08,
+ "loss": 1.7129,
+ "step": 262
+ },
+ {
+ "epoch": 1.9338235294117647,
+ "grad_norm": 1.046875,
+ "learning_rate": 3.673733771526466e-08,
+ "loss": 1.3399,
+ "step": 263
+ },
+ {
+ "epoch": 1.9411764705882353,
+ "grad_norm": 0.88671875,
+ "learning_rate": 2.976417800331144e-08,
+ "loss": 1.1532,
+ "step": 264
+ },
+ {
+ "epoch": 1.9485294117647058,
+ "grad_norm": 1.2265625,
+ "learning_rate": 2.352227840825394e-08,
+ "loss": 2.0323,
+ "step": 265
+ },
+ {
+ "epoch": 1.9558823529411766,
+ "grad_norm": 1.0234375,
+ "learning_rate": 1.8012557287367394e-08,
+ "loss": 1.7834,
+ "step": 266
+ },
+ {
+ "epoch": 1.9632352941176472,
+ "grad_norm": 1.0859375,
+ "learning_rate": 1.3235825274081626e-08,
+ "loss": 1.6388,
+ "step": 267
+ },
+ {
+ "epoch": 1.9705882352941178,
+ "grad_norm": 1.1328125,
+ "learning_rate": 9.192785158713691e-09,
+ "loss": 1.8893,
+ "step": 268
+ },
+ {
+ "epoch": 1.9779411764705883,
+ "grad_norm": 1.0234375,
+ "learning_rate": 5.884031785068356e-09,
+ "loss": 1.4061,
+ "step": 269
+ },
+ {
+ "epoch": 1.9852941176470589,
+ "grad_norm": 1.2578125,
+ "learning_rate": 3.3100519629203353e-09,
+ "loss": 1.8636,
+ "step": 270
+ },
+ {
+ "epoch": 1.9926470588235294,
+ "grad_norm": 0.86328125,
+ "learning_rate": 1.471224396389359e-09,
+ "loss": 0.9198,
+ "step": 271
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 1.3359375,
+ "learning_rate": 3.678196282252966e-10,
+ "loss": 1.8697,
+ "step": 272
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 1.979569435119629,
+ "eval_runtime": 27.7469,
+ "eval_samples_per_second": 1.586,
+ "eval_steps_per_second": 0.396,
+ "step": 272
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 272,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 136,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.618953940445954e+18,
+ "train_batch_size": 1,
+ "trial_name": null,
+ "trial_params": null
+ }
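
The trainer state above logs one entry per optimizer step, plus an eval entry every 17 steps. A minimal sketch for inspecting it and reproducing the logged learning-rate curve; the file path, the linear-warmup/cosine-decay form, the 13-step warmup, and the 1e-05 peak are inferred from the logged values, not read from a saved config:

import json
import math

# Path assumed: the trainer_state.json added in this commit.
with open("checkpoint-272/trainer_state.json") as f:
    state = json.load(f)

train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]
print(len(train_logs), "train steps;", "final eval_loss:", eval_logs[-1]["eval_loss"])

PEAK_LR, WARMUP, TOTAL = 1e-05, 13, state["max_steps"]  # warmup/peak are assumptions

def lr_at(step):
    """Learning rate recorded at 1-indexed optimizer step `step`."""
    s = step - 1
    if s < WARMUP:
        return PEAK_LR * s / WARMUP  # linear warmup: 0 -> peak
    progress = (s - WARMUP) / (TOTAL - WARMUP)
    return PEAK_LR * 0.5 * (1.0 + math.cos(math.pi * progress))  # cosine decay

for e in train_logs[:3] + train_logs[-1:]:
    print(e["step"], e["learning_rate"], lr_at(e["step"]))

Under these assumptions the sketch reproduces the log exactly: 0.0 at step 1, 7.692307692307694e-07 at step 2, and roughly 3.678e-10 at step 272.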
checkpoint-272/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:982309b232617b9bfc0df4d297ea42402a906069dd07d503a1b422c96356e1aa
+ size 10872
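
training_args.bin is the pickled TrainingArguments object the run was launched with, so it can be read back directly; a sketch assuming the checkpoint path above and a recent PyTorch (weights_only=False is needed because the file stores a Python object, not tensors, and transformers must be installed to unpickle it):

import torch

args = torch.load("checkpoint-272/training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)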
checkpoint-272/vocab.json ADDED
The diff for this file is too large to render. See raw diff