danielhanchen commited on
Commit
8e566f2
·
verified ·
1 Parent(s): e5abf57

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -58,7 +58,7 @@
58
  "num_key_value_heads": 8,
59
  "num_local_experts": 128,
60
  "output_router_logits": false,
61
- "pad_token_id": 199999,
62
  "quantization_config": {
63
  "modules_to_not_convert": [
64
  "model.layers.*.self_attn",
@@ -82,7 +82,8 @@
82
  "sliding_window": 128,
83
  "swiglu_limit": 7.0,
84
  "tie_word_embeddings": false,
85
- "transformers_version": "4.55.0.dev0",
 
86
  "use_cache": true,
87
  "vocab_size": 201088
88
  }
 
58
  "num_key_value_heads": 8,
59
  "num_local_experts": 128,
60
  "output_router_logits": false,
61
+ "pad_token_id": 200017,
62
  "quantization_config": {
63
  "modules_to_not_convert": [
64
  "model.layers.*.self_attn",
 
82
  "sliding_window": 128,
83
  "swiglu_limit": 7.0,
84
  "tie_word_embeddings": false,
85
+ "transformers_version": "4.56.0.dev0",
86
+ "unsloth_fixed": true,
87
  "use_cache": true,
88
  "vocab_size": 201088
89
  }
original/model--00001-of-00007.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:68a8dc1f8e2e5996cb702f14332a25ddf3463daeab2df68e21ca09ef181203c3
3
- size 10544040680
 
 
 
 
original/model--00002-of-00007.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:19b8f0d5c7dc3195c61a711d08384a1f85624f018186da541585c0f97ac61020
3
- size 10488721680
 
 
 
 
original/model--00003-of-00007.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0dbccd746d50e9543e8016d0a43ab4487c7f86d72349b1ef17abdfec509d0701
3
- size 10488721688
 
 
 
 
original/model--00004-of-00007.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bcc73cf6d18f96a2e62428758463157cc12768f410873152a50d3929a64cd049
3
- size 10488721672
 
 
 
 
original/model--00005-of-00007.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:15fd69843e9cc6fdf2db0efe0cf0979b49a6ba84b3a38169b2fabc5479d04a7d
3
- size 10488721680
 
 
 
 
original/model--00006-of-00007.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3aedef2ee0a5a78a003b3f74fd6883033946b80097bf41e4f4715d95066f0588
3
- size 10433402600
 
 
 
 
original/model--00007-of-00007.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:20d5dfcad1ed6c50aa3c0da7d3f08828dba72b5f58686a987bf3a8f01659cda6
3
- size 2316539800
 
 
 
 
original/model.safetensors.index.json DELETED
@@ -1,550 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_size": 65248815744
4
- },
5
- "weight_map": {
6
- "block.0.attn.norm.scale": "model--00001-of-00007.safetensors",
7
- "block.0.attn.out.bias": "model--00001-of-00007.safetensors",
8
- "block.0.attn.out.weight": "model--00001-of-00007.safetensors",
9
- "block.0.attn.qkv.bias": "model--00001-of-00007.safetensors",
10
- "block.0.attn.qkv.weight": "model--00001-of-00007.safetensors",
11
- "block.0.attn.sinks": "model--00001-of-00007.safetensors",
12
- "block.0.mlp.gate.bias": "model--00001-of-00007.safetensors",
13
- "block.0.mlp.gate.weight": "model--00001-of-00007.safetensors",
14
- "block.0.mlp.mlp1_bias": "model--00001-of-00007.safetensors",
15
- "block.0.mlp.mlp1_weight.blocks": "model--00001-of-00007.safetensors",
16
- "block.0.mlp.mlp1_weight.scales": "model--00001-of-00007.safetensors",
17
- "block.0.mlp.mlp2_bias": "model--00001-of-00007.safetensors",
18
- "block.0.mlp.mlp2_weight.blocks": "model--00001-of-00007.safetensors",
19
- "block.0.mlp.mlp2_weight.scales": "model--00001-of-00007.safetensors",
20
- "block.0.mlp.norm.scale": "model--00001-of-00007.safetensors",
21
- "block.1.attn.norm.scale": "model--00001-of-00007.safetensors",
22
- "block.1.attn.out.bias": "model--00001-of-00007.safetensors",
23
- "block.1.attn.out.weight": "model--00001-of-00007.safetensors",
24
- "block.1.attn.qkv.bias": "model--00001-of-00007.safetensors",
25
- "block.1.attn.qkv.weight": "model--00001-of-00007.safetensors",
26
- "block.1.attn.sinks": "model--00001-of-00007.safetensors",
27
- "block.1.mlp.gate.bias": "model--00001-of-00007.safetensors",
28
- "block.1.mlp.gate.weight": "model--00001-of-00007.safetensors",
29
- "block.1.mlp.mlp1_bias": "model--00001-of-00007.safetensors",
30
- "block.1.mlp.mlp1_weight.blocks": "model--00001-of-00007.safetensors",
31
- "block.1.mlp.mlp1_weight.scales": "model--00001-of-00007.safetensors",
32
- "block.1.mlp.mlp2_bias": "model--00001-of-00007.safetensors",
33
- "block.1.mlp.mlp2_weight.blocks": "model--00001-of-00007.safetensors",
34
- "block.1.mlp.mlp2_weight.scales": "model--00001-of-00007.safetensors",
35
- "block.1.mlp.norm.scale": "model--00001-of-00007.safetensors",
36
- "block.10.attn.norm.scale": "model--00001-of-00007.safetensors",
37
- "block.10.attn.out.bias": "model--00001-of-00007.safetensors",
38
- "block.10.attn.out.weight": "model--00001-of-00007.safetensors",
39
- "block.10.attn.qkv.bias": "model--00001-of-00007.safetensors",
40
- "block.10.attn.qkv.weight": "model--00001-of-00007.safetensors",
41
- "block.10.attn.sinks": "model--00001-of-00007.safetensors",
42
- "block.10.mlp.gate.bias": "model--00001-of-00007.safetensors",
43
- "block.10.mlp.gate.weight": "model--00001-of-00007.safetensors",
44
- "block.10.mlp.mlp1_bias": "model--00001-of-00007.safetensors",
45
- "block.10.mlp.mlp1_weight.blocks": "model--00001-of-00007.safetensors",
46
- "block.10.mlp.mlp1_weight.scales": "model--00001-of-00007.safetensors",
47
- "block.10.mlp.mlp2_bias": "model--00001-of-00007.safetensors",
48
- "block.10.mlp.mlp2_weight.blocks": "model--00001-of-00007.safetensors",
49
- "block.10.mlp.mlp2_weight.scales": "model--00001-of-00007.safetensors",
50
- "block.10.mlp.norm.scale": "model--00001-of-00007.safetensors",
51
- "block.11.attn.norm.scale": "model--00001-of-00007.safetensors",
52
- "block.11.attn.out.bias": "model--00001-of-00007.safetensors",
53
- "block.11.attn.out.weight": "model--00001-of-00007.safetensors",
54
- "block.11.attn.qkv.bias": "model--00001-of-00007.safetensors",
55
- "block.11.attn.qkv.weight": "model--00001-of-00007.safetensors",
56
- "block.11.attn.sinks": "model--00001-of-00007.safetensors",
57
- "block.11.mlp.gate.bias": "model--00001-of-00007.safetensors",
58
- "block.11.mlp.gate.weight": "model--00001-of-00007.safetensors",
59
- "block.11.mlp.mlp1_bias": "model--00001-of-00007.safetensors",
60
- "block.11.mlp.mlp1_weight.blocks": "model--00001-of-00007.safetensors",
61
- "block.11.mlp.mlp1_weight.scales": "model--00001-of-00007.safetensors",
62
- "block.11.mlp.mlp2_bias": "model--00001-of-00007.safetensors",
63
- "block.11.mlp.mlp2_weight.blocks": "model--00001-of-00007.safetensors",
64
- "block.11.mlp.mlp2_weight.scales": "model--00001-of-00007.safetensors",
65
- "block.11.mlp.norm.scale": "model--00001-of-00007.safetensors",
66
- "block.12.attn.norm.scale": "model--00001-of-00007.safetensors",
67
- "block.12.attn.out.bias": "model--00001-of-00007.safetensors",
68
- "block.12.attn.out.weight": "model--00001-of-00007.safetensors",
69
- "block.12.attn.qkv.bias": "model--00001-of-00007.safetensors",
70
- "block.12.attn.qkv.weight": "model--00001-of-00007.safetensors",
71
- "block.12.attn.sinks": "model--00001-of-00007.safetensors",
72
- "block.12.mlp.gate.bias": "model--00001-of-00007.safetensors",
73
- "block.12.mlp.gate.weight": "model--00001-of-00007.safetensors",
74
- "block.12.mlp.mlp1_bias": "model--00001-of-00007.safetensors",
75
- "block.12.mlp.mlp1_weight.blocks": "model--00001-of-00007.safetensors",
76
- "block.12.mlp.mlp1_weight.scales": "model--00001-of-00007.safetensors",
77
- "block.12.mlp.mlp2_bias": "model--00001-of-00007.safetensors",
78
- "block.12.mlp.mlp2_weight.blocks": "model--00001-of-00007.safetensors",
79
- "block.12.mlp.mlp2_weight.scales": "model--00001-of-00007.safetensors",
80
- "block.12.mlp.norm.scale": "model--00001-of-00007.safetensors",
81
- "block.13.attn.norm.scale": "model--00001-of-00007.safetensors",
82
- "block.13.attn.out.bias": "model--00001-of-00007.safetensors",
83
- "block.13.attn.out.weight": "model--00001-of-00007.safetensors",
84
- "block.13.attn.qkv.bias": "model--00001-of-00007.safetensors",
85
- "block.13.attn.qkv.weight": "model--00001-of-00007.safetensors",
86
- "block.13.attn.sinks": "model--00001-of-00007.safetensors",
87
- "block.13.mlp.gate.bias": "model--00001-of-00007.safetensors",
88
- "block.13.mlp.gate.weight": "model--00001-of-00007.safetensors",
89
- "block.13.mlp.mlp1_bias": "model--00001-of-00007.safetensors",
90
- "block.13.mlp.mlp1_weight.blocks": "model--00001-of-00007.safetensors",
91
- "block.13.mlp.mlp1_weight.scales": "model--00001-of-00007.safetensors",
92
- "block.13.mlp.mlp2_bias": "model--00001-of-00007.safetensors",
93
- "block.13.mlp.mlp2_weight.blocks": "model--00001-of-00007.safetensors",
94
- "block.13.mlp.mlp2_weight.scales": "model--00001-of-00007.safetensors",
95
- "block.13.mlp.norm.scale": "model--00001-of-00007.safetensors",
96
- "block.14.attn.norm.scale": "model--00001-of-00007.safetensors",
97
- "block.14.attn.out.bias": "model--00001-of-00007.safetensors",
98
- "block.14.attn.out.weight": "model--00001-of-00007.safetensors",
99
- "block.14.attn.qkv.bias": "model--00001-of-00007.safetensors",
100
- "block.14.attn.qkv.weight": "model--00001-of-00007.safetensors",
101
- "block.14.attn.sinks": "model--00001-of-00007.safetensors",
102
- "block.14.mlp.gate.bias": "model--00001-of-00007.safetensors",
103
- "block.14.mlp.gate.weight": "model--00001-of-00007.safetensors",
104
- "block.14.mlp.mlp1_bias": "model--00001-of-00007.safetensors",
105
- "block.14.mlp.mlp1_weight.blocks": "model--00002-of-00007.safetensors",
106
- "block.14.mlp.mlp1_weight.scales": "model--00002-of-00007.safetensors",
107
- "block.14.mlp.mlp2_bias": "model--00002-of-00007.safetensors",
108
- "block.14.mlp.mlp2_weight.blocks": "model--00002-of-00007.safetensors",
109
- "block.14.mlp.mlp2_weight.scales": "model--00002-of-00007.safetensors",
110
- "block.14.mlp.norm.scale": "model--00002-of-00007.safetensors",
111
- "block.15.attn.norm.scale": "model--00002-of-00007.safetensors",
112
- "block.15.attn.out.bias": "model--00002-of-00007.safetensors",
113
- "block.15.attn.out.weight": "model--00002-of-00007.safetensors",
114
- "block.15.attn.qkv.bias": "model--00002-of-00007.safetensors",
115
- "block.15.attn.qkv.weight": "model--00002-of-00007.safetensors",
116
- "block.15.attn.sinks": "model--00002-of-00007.safetensors",
117
- "block.15.mlp.gate.bias": "model--00002-of-00007.safetensors",
118
- "block.15.mlp.gate.weight": "model--00002-of-00007.safetensors",
119
- "block.15.mlp.mlp1_bias": "model--00002-of-00007.safetensors",
120
- "block.15.mlp.mlp1_weight.blocks": "model--00002-of-00007.safetensors",
121
- "block.15.mlp.mlp1_weight.scales": "model--00002-of-00007.safetensors",
122
- "block.15.mlp.mlp2_bias": "model--00002-of-00007.safetensors",
123
- "block.15.mlp.mlp2_weight.blocks": "model--00002-of-00007.safetensors",
124
- "block.15.mlp.mlp2_weight.scales": "model--00002-of-00007.safetensors",
125
- "block.15.mlp.norm.scale": "model--00002-of-00007.safetensors",
126
- "block.16.attn.norm.scale": "model--00002-of-00007.safetensors",
127
- "block.16.attn.out.bias": "model--00002-of-00007.safetensors",
128
- "block.16.attn.out.weight": "model--00002-of-00007.safetensors",
129
- "block.16.attn.qkv.bias": "model--00002-of-00007.safetensors",
130
- "block.16.attn.qkv.weight": "model--00002-of-00007.safetensors",
131
- "block.16.attn.sinks": "model--00002-of-00007.safetensors",
132
- "block.16.mlp.gate.bias": "model--00002-of-00007.safetensors",
133
- "block.16.mlp.gate.weight": "model--00002-of-00007.safetensors",
134
- "block.16.mlp.mlp1_bias": "model--00002-of-00007.safetensors",
135
- "block.16.mlp.mlp1_weight.blocks": "model--00002-of-00007.safetensors",
136
- "block.16.mlp.mlp1_weight.scales": "model--00002-of-00007.safetensors",
137
- "block.16.mlp.mlp2_bias": "model--00002-of-00007.safetensors",
138
- "block.16.mlp.mlp2_weight.blocks": "model--00002-of-00007.safetensors",
139
- "block.16.mlp.mlp2_weight.scales": "model--00002-of-00007.safetensors",
140
- "block.16.mlp.norm.scale": "model--00002-of-00007.safetensors",
141
- "block.17.attn.norm.scale": "model--00002-of-00007.safetensors",
142
- "block.17.attn.out.bias": "model--00002-of-00007.safetensors",
143
- "block.17.attn.out.weight": "model--00002-of-00007.safetensors",
144
- "block.17.attn.qkv.bias": "model--00002-of-00007.safetensors",
145
- "block.17.attn.qkv.weight": "model--00002-of-00007.safetensors",
146
- "block.17.attn.sinks": "model--00002-of-00007.safetensors",
147
- "block.17.mlp.gate.bias": "model--00002-of-00007.safetensors",
148
- "block.17.mlp.gate.weight": "model--00002-of-00007.safetensors",
149
- "block.17.mlp.mlp1_bias": "model--00002-of-00007.safetensors",
150
- "block.17.mlp.mlp1_weight.blocks": "model--00002-of-00007.safetensors",
151
- "block.17.mlp.mlp1_weight.scales": "model--00002-of-00007.safetensors",
152
- "block.17.mlp.mlp2_bias": "model--00002-of-00007.safetensors",
153
- "block.17.mlp.mlp2_weight.blocks": "model--00002-of-00007.safetensors",
154
- "block.17.mlp.mlp2_weight.scales": "model--00002-of-00007.safetensors",
155
- "block.17.mlp.norm.scale": "model--00002-of-00007.safetensors",
156
- "block.18.attn.norm.scale": "model--00002-of-00007.safetensors",
157
- "block.18.attn.out.bias": "model--00002-of-00007.safetensors",
158
- "block.18.attn.out.weight": "model--00002-of-00007.safetensors",
159
- "block.18.attn.qkv.bias": "model--00002-of-00007.safetensors",
160
- "block.18.attn.qkv.weight": "model--00002-of-00007.safetensors",
161
- "block.18.attn.sinks": "model--00002-of-00007.safetensors",
162
- "block.18.mlp.gate.bias": "model--00002-of-00007.safetensors",
163
- "block.18.mlp.gate.weight": "model--00002-of-00007.safetensors",
164
- "block.18.mlp.mlp1_bias": "model--00002-of-00007.safetensors",
165
- "block.18.mlp.mlp1_weight.blocks": "model--00002-of-00007.safetensors",
166
- "block.18.mlp.mlp1_weight.scales": "model--00002-of-00007.safetensors",
167
- "block.18.mlp.mlp2_bias": "model--00002-of-00007.safetensors",
168
- "block.18.mlp.mlp2_weight.blocks": "model--00002-of-00007.safetensors",
169
- "block.18.mlp.mlp2_weight.scales": "model--00002-of-00007.safetensors",
170
- "block.18.mlp.norm.scale": "model--00002-of-00007.safetensors",
171
- "block.19.attn.norm.scale": "model--00002-of-00007.safetensors",
172
- "block.19.attn.out.bias": "model--00002-of-00007.safetensors",
173
- "block.19.attn.out.weight": "model--00002-of-00007.safetensors",
174
- "block.19.attn.qkv.bias": "model--00002-of-00007.safetensors",
175
- "block.19.attn.qkv.weight": "model--00002-of-00007.safetensors",
176
- "block.19.attn.sinks": "model--00002-of-00007.safetensors",
177
- "block.19.mlp.gate.bias": "model--00002-of-00007.safetensors",
178
- "block.19.mlp.gate.weight": "model--00002-of-00007.safetensors",
179
- "block.19.mlp.mlp1_bias": "model--00002-of-00007.safetensors",
180
- "block.19.mlp.mlp1_weight.blocks": "model--00002-of-00007.safetensors",
181
- "block.19.mlp.mlp1_weight.scales": "model--00002-of-00007.safetensors",
182
- "block.19.mlp.mlp2_bias": "model--00002-of-00007.safetensors",
183
- "block.19.mlp.mlp2_weight.blocks": "model--00002-of-00007.safetensors",
184
- "block.19.mlp.mlp2_weight.scales": "model--00002-of-00007.safetensors",
185
- "block.19.mlp.norm.scale": "model--00002-of-00007.safetensors",
186
- "block.2.attn.norm.scale": "model--00002-of-00007.safetensors",
187
- "block.2.attn.out.bias": "model--00002-of-00007.safetensors",
188
- "block.2.attn.out.weight": "model--00002-of-00007.safetensors",
189
- "block.2.attn.qkv.bias": "model--00002-of-00007.safetensors",
190
- "block.2.attn.qkv.weight": "model--00002-of-00007.safetensors",
191
- "block.2.attn.sinks": "model--00002-of-00007.safetensors",
192
- "block.2.mlp.gate.bias": "model--00002-of-00007.safetensors",
193
- "block.2.mlp.gate.weight": "model--00002-of-00007.safetensors",
194
- "block.2.mlp.mlp1_bias": "model--00002-of-00007.safetensors",
195
- "block.2.mlp.mlp1_weight.blocks": "model--00003-of-00007.safetensors",
196
- "block.2.mlp.mlp1_weight.scales": "model--00003-of-00007.safetensors",
197
- "block.2.mlp.mlp2_bias": "model--00003-of-00007.safetensors",
198
- "block.2.mlp.mlp2_weight.blocks": "model--00003-of-00007.safetensors",
199
- "block.2.mlp.mlp2_weight.scales": "model--00003-of-00007.safetensors",
200
- "block.2.mlp.norm.scale": "model--00003-of-00007.safetensors",
201
- "block.20.attn.norm.scale": "model--00003-of-00007.safetensors",
202
- "block.20.attn.out.bias": "model--00003-of-00007.safetensors",
203
- "block.20.attn.out.weight": "model--00003-of-00007.safetensors",
204
- "block.20.attn.qkv.bias": "model--00003-of-00007.safetensors",
205
- "block.20.attn.qkv.weight": "model--00003-of-00007.safetensors",
206
- "block.20.attn.sinks": "model--00003-of-00007.safetensors",
207
- "block.20.mlp.gate.bias": "model--00003-of-00007.safetensors",
208
- "block.20.mlp.gate.weight": "model--00003-of-00007.safetensors",
209
- "block.20.mlp.mlp1_bias": "model--00003-of-00007.safetensors",
210
- "block.20.mlp.mlp1_weight.blocks": "model--00003-of-00007.safetensors",
211
- "block.20.mlp.mlp1_weight.scales": "model--00003-of-00007.safetensors",
212
- "block.20.mlp.mlp2_bias": "model--00003-of-00007.safetensors",
213
- "block.20.mlp.mlp2_weight.blocks": "model--00003-of-00007.safetensors",
214
- "block.20.mlp.mlp2_weight.scales": "model--00003-of-00007.safetensors",
215
- "block.20.mlp.norm.scale": "model--00003-of-00007.safetensors",
216
- "block.21.attn.norm.scale": "model--00003-of-00007.safetensors",
217
- "block.21.attn.out.bias": "model--00003-of-00007.safetensors",
218
- "block.21.attn.out.weight": "model--00003-of-00007.safetensors",
219
- "block.21.attn.qkv.bias": "model--00003-of-00007.safetensors",
220
- "block.21.attn.qkv.weight": "model--00003-of-00007.safetensors",
221
- "block.21.attn.sinks": "model--00003-of-00007.safetensors",
222
- "block.21.mlp.gate.bias": "model--00003-of-00007.safetensors",
223
- "block.21.mlp.gate.weight": "model--00003-of-00007.safetensors",
224
- "block.21.mlp.mlp1_bias": "model--00003-of-00007.safetensors",
225
- "block.21.mlp.mlp1_weight.blocks": "model--00003-of-00007.safetensors",
226
- "block.21.mlp.mlp1_weight.scales": "model--00003-of-00007.safetensors",
227
- "block.21.mlp.mlp2_bias": "model--00003-of-00007.safetensors",
228
- "block.21.mlp.mlp2_weight.blocks": "model--00003-of-00007.safetensors",
229
- "block.21.mlp.mlp2_weight.scales": "model--00003-of-00007.safetensors",
230
- "block.21.mlp.norm.scale": "model--00003-of-00007.safetensors",
231
- "block.22.attn.norm.scale": "model--00003-of-00007.safetensors",
232
- "block.22.attn.out.bias": "model--00003-of-00007.safetensors",
233
- "block.22.attn.out.weight": "model--00003-of-00007.safetensors",
234
- "block.22.attn.qkv.bias": "model--00003-of-00007.safetensors",
235
- "block.22.attn.qkv.weight": "model--00003-of-00007.safetensors",
236
- "block.22.attn.sinks": "model--00003-of-00007.safetensors",
237
- "block.22.mlp.gate.bias": "model--00003-of-00007.safetensors",
238
- "block.22.mlp.gate.weight": "model--00003-of-00007.safetensors",
239
- "block.22.mlp.mlp1_bias": "model--00003-of-00007.safetensors",
240
- "block.22.mlp.mlp1_weight.blocks": "model--00003-of-00007.safetensors",
241
- "block.22.mlp.mlp1_weight.scales": "model--00003-of-00007.safetensors",
242
- "block.22.mlp.mlp2_bias": "model--00003-of-00007.safetensors",
243
- "block.22.mlp.mlp2_weight.blocks": "model--00003-of-00007.safetensors",
244
- "block.22.mlp.mlp2_weight.scales": "model--00003-of-00007.safetensors",
245
- "block.22.mlp.norm.scale": "model--00003-of-00007.safetensors",
246
- "block.23.attn.norm.scale": "model--00003-of-00007.safetensors",
247
- "block.23.attn.out.bias": "model--00003-of-00007.safetensors",
248
- "block.23.attn.out.weight": "model--00003-of-00007.safetensors",
249
- "block.23.attn.qkv.bias": "model--00003-of-00007.safetensors",
250
- "block.23.attn.qkv.weight": "model--00003-of-00007.safetensors",
251
- "block.23.attn.sinks": "model--00003-of-00007.safetensors",
252
- "block.23.mlp.gate.bias": "model--00003-of-00007.safetensors",
253
- "block.23.mlp.gate.weight": "model--00003-of-00007.safetensors",
254
- "block.23.mlp.mlp1_bias": "model--00003-of-00007.safetensors",
255
- "block.23.mlp.mlp1_weight.blocks": "model--00003-of-00007.safetensors",
256
- "block.23.mlp.mlp1_weight.scales": "model--00003-of-00007.safetensors",
257
- "block.23.mlp.mlp2_bias": "model--00003-of-00007.safetensors",
258
- "block.23.mlp.mlp2_weight.blocks": "model--00003-of-00007.safetensors",
259
- "block.23.mlp.mlp2_weight.scales": "model--00003-of-00007.safetensors",
260
- "block.23.mlp.norm.scale": "model--00003-of-00007.safetensors",
261
- "block.24.attn.norm.scale": "model--00003-of-00007.safetensors",
262
- "block.24.attn.out.bias": "model--00003-of-00007.safetensors",
263
- "block.24.attn.out.weight": "model--00003-of-00007.safetensors",
264
- "block.24.attn.qkv.bias": "model--00003-of-00007.safetensors",
265
- "block.24.attn.qkv.weight": "model--00003-of-00007.safetensors",
266
- "block.24.attn.sinks": "model--00003-of-00007.safetensors",
267
- "block.24.mlp.gate.bias": "model--00003-of-00007.safetensors",
268
- "block.24.mlp.gate.weight": "model--00003-of-00007.safetensors",
269
- "block.24.mlp.mlp1_bias": "model--00003-of-00007.safetensors",
270
- "block.24.mlp.mlp1_weight.blocks": "model--00003-of-00007.safetensors",
271
- "block.24.mlp.mlp1_weight.scales": "model--00003-of-00007.safetensors",
272
- "block.24.mlp.mlp2_bias": "model--00003-of-00007.safetensors",
273
- "block.24.mlp.mlp2_weight.blocks": "model--00003-of-00007.safetensors",
274
- "block.24.mlp.mlp2_weight.scales": "model--00003-of-00007.safetensors",
275
- "block.24.mlp.norm.scale": "model--00003-of-00007.safetensors",
276
- "block.25.attn.norm.scale": "model--00003-of-00007.safetensors",
277
- "block.25.attn.out.bias": "model--00003-of-00007.safetensors",
278
- "block.25.attn.out.weight": "model--00003-of-00007.safetensors",
279
- "block.25.attn.qkv.bias": "model--00003-of-00007.safetensors",
280
- "block.25.attn.qkv.weight": "model--00003-of-00007.safetensors",
281
- "block.25.attn.sinks": "model--00003-of-00007.safetensors",
282
- "block.25.mlp.gate.bias": "model--00003-of-00007.safetensors",
283
- "block.25.mlp.gate.weight": "model--00003-of-00007.safetensors",
284
- "block.25.mlp.mlp1_bias": "model--00003-of-00007.safetensors",
285
- "block.25.mlp.mlp1_weight.blocks": "model--00004-of-00007.safetensors",
286
- "block.25.mlp.mlp1_weight.scales": "model--00004-of-00007.safetensors",
287
- "block.25.mlp.mlp2_bias": "model--00004-of-00007.safetensors",
288
- "block.25.mlp.mlp2_weight.blocks": "model--00004-of-00007.safetensors",
289
- "block.25.mlp.mlp2_weight.scales": "model--00004-of-00007.safetensors",
290
- "block.25.mlp.norm.scale": "model--00004-of-00007.safetensors",
291
- "block.26.attn.norm.scale": "model--00004-of-00007.safetensors",
292
- "block.26.attn.out.bias": "model--00004-of-00007.safetensors",
293
- "block.26.attn.out.weight": "model--00004-of-00007.safetensors",
294
- "block.26.attn.qkv.bias": "model--00004-of-00007.safetensors",
295
- "block.26.attn.qkv.weight": "model--00004-of-00007.safetensors",
296
- "block.26.attn.sinks": "model--00004-of-00007.safetensors",
297
- "block.26.mlp.gate.bias": "model--00004-of-00007.safetensors",
298
- "block.26.mlp.gate.weight": "model--00004-of-00007.safetensors",
299
- "block.26.mlp.mlp1_bias": "model--00004-of-00007.safetensors",
300
- "block.26.mlp.mlp1_weight.blocks": "model--00004-of-00007.safetensors",
301
- "block.26.mlp.mlp1_weight.scales": "model--00004-of-00007.safetensors",
302
- "block.26.mlp.mlp2_bias": "model--00004-of-00007.safetensors",
303
- "block.26.mlp.mlp2_weight.blocks": "model--00004-of-00007.safetensors",
304
- "block.26.mlp.mlp2_weight.scales": "model--00004-of-00007.safetensors",
305
- "block.26.mlp.norm.scale": "model--00004-of-00007.safetensors",
306
- "block.27.attn.norm.scale": "model--00004-of-00007.safetensors",
307
- "block.27.attn.out.bias": "model--00004-of-00007.safetensors",
308
- "block.27.attn.out.weight": "model--00004-of-00007.safetensors",
309
- "block.27.attn.qkv.bias": "model--00004-of-00007.safetensors",
310
- "block.27.attn.qkv.weight": "model--00004-of-00007.safetensors",
311
- "block.27.attn.sinks": "model--00004-of-00007.safetensors",
312
- "block.27.mlp.gate.bias": "model--00004-of-00007.safetensors",
313
- "block.27.mlp.gate.weight": "model--00004-of-00007.safetensors",
314
- "block.27.mlp.mlp1_bias": "model--00004-of-00007.safetensors",
315
- "block.27.mlp.mlp1_weight.blocks": "model--00004-of-00007.safetensors",
316
- "block.27.mlp.mlp1_weight.scales": "model--00004-of-00007.safetensors",
317
- "block.27.mlp.mlp2_bias": "model--00004-of-00007.safetensors",
318
- "block.27.mlp.mlp2_weight.blocks": "model--00004-of-00007.safetensors",
319
- "block.27.mlp.mlp2_weight.scales": "model--00004-of-00007.safetensors",
320
- "block.27.mlp.norm.scale": "model--00004-of-00007.safetensors",
321
- "block.28.attn.norm.scale": "model--00004-of-00007.safetensors",
322
- "block.28.attn.out.bias": "model--00004-of-00007.safetensors",
323
- "block.28.attn.out.weight": "model--00004-of-00007.safetensors",
324
- "block.28.attn.qkv.bias": "model--00004-of-00007.safetensors",
325
- "block.28.attn.qkv.weight": "model--00004-of-00007.safetensors",
326
- "block.28.attn.sinks": "model--00004-of-00007.safetensors",
327
- "block.28.mlp.gate.bias": "model--00004-of-00007.safetensors",
328
- "block.28.mlp.gate.weight": "model--00004-of-00007.safetensors",
329
- "block.28.mlp.mlp1_bias": "model--00004-of-00007.safetensors",
330
- "block.28.mlp.mlp1_weight.blocks": "model--00004-of-00007.safetensors",
331
- "block.28.mlp.mlp1_weight.scales": "model--00004-of-00007.safetensors",
332
- "block.28.mlp.mlp2_bias": "model--00004-of-00007.safetensors",
333
- "block.28.mlp.mlp2_weight.blocks": "model--00004-of-00007.safetensors",
334
- "block.28.mlp.mlp2_weight.scales": "model--00004-of-00007.safetensors",
335
- "block.28.mlp.norm.scale": "model--00004-of-00007.safetensors",
336
- "block.29.attn.norm.scale": "model--00004-of-00007.safetensors",
337
- "block.29.attn.out.bias": "model--00004-of-00007.safetensors",
338
- "block.29.attn.out.weight": "model--00004-of-00007.safetensors",
339
- "block.29.attn.qkv.bias": "model--00004-of-00007.safetensors",
340
- "block.29.attn.qkv.weight": "model--00004-of-00007.safetensors",
341
- "block.29.attn.sinks": "model--00004-of-00007.safetensors",
342
- "block.29.mlp.gate.bias": "model--00004-of-00007.safetensors",
343
- "block.29.mlp.gate.weight": "model--00004-of-00007.safetensors",
344
- "block.29.mlp.mlp1_bias": "model--00004-of-00007.safetensors",
345
- "block.29.mlp.mlp1_weight.blocks": "model--00004-of-00007.safetensors",
346
- "block.29.mlp.mlp1_weight.scales": "model--00004-of-00007.safetensors",
347
- "block.29.mlp.mlp2_bias": "model--00004-of-00007.safetensors",
348
- "block.29.mlp.mlp2_weight.blocks": "model--00004-of-00007.safetensors",
349
- "block.29.mlp.mlp2_weight.scales": "model--00004-of-00007.safetensors",
350
- "block.29.mlp.norm.scale": "model--00004-of-00007.safetensors",
351
- "block.3.attn.norm.scale": "model--00004-of-00007.safetensors",
352
- "block.3.attn.out.bias": "model--00004-of-00007.safetensors",
353
- "block.3.attn.out.weight": "model--00004-of-00007.safetensors",
354
- "block.3.attn.qkv.bias": "model--00004-of-00007.safetensors",
355
- "block.3.attn.qkv.weight": "model--00004-of-00007.safetensors",
356
- "block.3.attn.sinks": "model--00004-of-00007.safetensors",
357
- "block.3.mlp.gate.bias": "model--00004-of-00007.safetensors",
358
- "block.3.mlp.gate.weight": "model--00004-of-00007.safetensors",
359
- "block.3.mlp.mlp1_bias": "model--00004-of-00007.safetensors",
360
- "block.3.mlp.mlp1_weight.blocks": "model--00004-of-00007.safetensors",
361
- "block.3.mlp.mlp1_weight.scales": "model--00004-of-00007.safetensors",
362
- "block.3.mlp.mlp2_bias": "model--00004-of-00007.safetensors",
363
- "block.3.mlp.mlp2_weight.blocks": "model--00004-of-00007.safetensors",
364
- "block.3.mlp.mlp2_weight.scales": "model--00004-of-00007.safetensors",
365
- "block.3.mlp.norm.scale": "model--00004-of-00007.safetensors",
366
- "block.30.attn.norm.scale": "model--00004-of-00007.safetensors",
367
- "block.30.attn.out.bias": "model--00004-of-00007.safetensors",
368
- "block.30.attn.out.weight": "model--00004-of-00007.safetensors",
369
- "block.30.attn.qkv.bias": "model--00004-of-00007.safetensors",
370
- "block.30.attn.qkv.weight": "model--00004-of-00007.safetensors",
371
- "block.30.attn.sinks": "model--00004-of-00007.safetensors",
372
- "block.30.mlp.gate.bias": "model--00004-of-00007.safetensors",
373
- "block.30.mlp.gate.weight": "model--00004-of-00007.safetensors",
374
- "block.30.mlp.mlp1_bias": "model--00004-of-00007.safetensors",
375
- "block.30.mlp.mlp1_weight.blocks": "model--00005-of-00007.safetensors",
376
- "block.30.mlp.mlp1_weight.scales": "model--00005-of-00007.safetensors",
377
- "block.30.mlp.mlp2_bias": "model--00005-of-00007.safetensors",
378
- "block.30.mlp.mlp2_weight.blocks": "model--00005-of-00007.safetensors",
379
- "block.30.mlp.mlp2_weight.scales": "model--00005-of-00007.safetensors",
380
- "block.30.mlp.norm.scale": "model--00005-of-00007.safetensors",
381
- "block.31.attn.norm.scale": "model--00005-of-00007.safetensors",
382
- "block.31.attn.out.bias": "model--00005-of-00007.safetensors",
383
- "block.31.attn.out.weight": "model--00005-of-00007.safetensors",
384
- "block.31.attn.qkv.bias": "model--00005-of-00007.safetensors",
385
- "block.31.attn.qkv.weight": "model--00005-of-00007.safetensors",
386
- "block.31.attn.sinks": "model--00005-of-00007.safetensors",
387
- "block.31.mlp.gate.bias": "model--00005-of-00007.safetensors",
388
- "block.31.mlp.gate.weight": "model--00005-of-00007.safetensors",
389
- "block.31.mlp.mlp1_bias": "model--00005-of-00007.safetensors",
390
- "block.31.mlp.mlp1_weight.blocks": "model--00005-of-00007.safetensors",
391
- "block.31.mlp.mlp1_weight.scales": "model--00005-of-00007.safetensors",
392
- "block.31.mlp.mlp2_bias": "model--00005-of-00007.safetensors",
393
- "block.31.mlp.mlp2_weight.blocks": "model--00005-of-00007.safetensors",
394
- "block.31.mlp.mlp2_weight.scales": "model--00005-of-00007.safetensors",
395
- "block.31.mlp.norm.scale": "model--00005-of-00007.safetensors",
396
- "block.32.attn.norm.scale": "model--00005-of-00007.safetensors",
397
- "block.32.attn.out.bias": "model--00005-of-00007.safetensors",
398
- "block.32.attn.out.weight": "model--00005-of-00007.safetensors",
399
- "block.32.attn.qkv.bias": "model--00005-of-00007.safetensors",
400
- "block.32.attn.qkv.weight": "model--00005-of-00007.safetensors",
401
- "block.32.attn.sinks": "model--00005-of-00007.safetensors",
402
- "block.32.mlp.gate.bias": "model--00005-of-00007.safetensors",
403
- "block.32.mlp.gate.weight": "model--00005-of-00007.safetensors",
404
- "block.32.mlp.mlp1_bias": "model--00005-of-00007.safetensors",
405
- "block.32.mlp.mlp1_weight.blocks": "model--00005-of-00007.safetensors",
406
- "block.32.mlp.mlp1_weight.scales": "model--00005-of-00007.safetensors",
407
- "block.32.mlp.mlp2_bias": "model--00005-of-00007.safetensors",
408
- "block.32.mlp.mlp2_weight.blocks": "model--00005-of-00007.safetensors",
409
- "block.32.mlp.mlp2_weight.scales": "model--00005-of-00007.safetensors",
410
- "block.32.mlp.norm.scale": "model--00005-of-00007.safetensors",
411
- "block.33.attn.norm.scale": "model--00005-of-00007.safetensors",
412
- "block.33.attn.out.bias": "model--00005-of-00007.safetensors",
413
- "block.33.attn.out.weight": "model--00005-of-00007.safetensors",
414
- "block.33.attn.qkv.bias": "model--00005-of-00007.safetensors",
415
- "block.33.attn.qkv.weight": "model--00005-of-00007.safetensors",
416
- "block.33.attn.sinks": "model--00005-of-00007.safetensors",
417
- "block.33.mlp.gate.bias": "model--00005-of-00007.safetensors",
418
- "block.33.mlp.gate.weight": "model--00005-of-00007.safetensors",
419
- "block.33.mlp.mlp1_bias": "model--00005-of-00007.safetensors",
420
- "block.33.mlp.mlp1_weight.blocks": "model--00005-of-00007.safetensors",
421
- "block.33.mlp.mlp1_weight.scales": "model--00005-of-00007.safetensors",
422
- "block.33.mlp.mlp2_bias": "model--00005-of-00007.safetensors",
423
- "block.33.mlp.mlp2_weight.blocks": "model--00005-of-00007.safetensors",
424
- "block.33.mlp.mlp2_weight.scales": "model--00005-of-00007.safetensors",
425
- "block.33.mlp.norm.scale": "model--00005-of-00007.safetensors",
426
- "block.34.attn.norm.scale": "model--00005-of-00007.safetensors",
427
- "block.34.attn.out.bias": "model--00005-of-00007.safetensors",
428
- "block.34.attn.out.weight": "model--00005-of-00007.safetensors",
429
- "block.34.attn.qkv.bias": "model--00005-of-00007.safetensors",
430
- "block.34.attn.qkv.weight": "model--00005-of-00007.safetensors",
431
- "block.34.attn.sinks": "model--00005-of-00007.safetensors",
432
- "block.34.mlp.gate.bias": "model--00005-of-00007.safetensors",
433
- "block.34.mlp.gate.weight": "model--00005-of-00007.safetensors",
434
- "block.34.mlp.mlp1_bias": "model--00005-of-00007.safetensors",
435
- "block.34.mlp.mlp1_weight.blocks": "model--00005-of-00007.safetensors",
436
- "block.34.mlp.mlp1_weight.scales": "model--00005-of-00007.safetensors",
437
- "block.34.mlp.mlp2_bias": "model--00005-of-00007.safetensors",
438
- "block.34.mlp.mlp2_weight.blocks": "model--00005-of-00007.safetensors",
439
- "block.34.mlp.mlp2_weight.scales": "model--00005-of-00007.safetensors",
440
- "block.34.mlp.norm.scale": "model--00005-of-00007.safetensors",
441
- "block.35.attn.norm.scale": "model--00005-of-00007.safetensors",
442
- "block.35.attn.out.bias": "model--00005-of-00007.safetensors",
443
- "block.35.attn.out.weight": "model--00005-of-00007.safetensors",
444
- "block.35.attn.qkv.bias": "model--00005-of-00007.safetensors",
445
- "block.35.attn.qkv.weight": "model--00005-of-00007.safetensors",
446
- "block.35.attn.sinks": "model--00005-of-00007.safetensors",
447
- "block.35.mlp.gate.bias": "model--00005-of-00007.safetensors",
448
- "block.35.mlp.gate.weight": "model--00005-of-00007.safetensors",
449
- "block.35.mlp.mlp1_bias": "model--00005-of-00007.safetensors",
450
- "block.35.mlp.mlp1_weight.blocks": "model--00005-of-00007.safetensors",
451
- "block.35.mlp.mlp1_weight.scales": "model--00005-of-00007.safetensors",
452
- "block.35.mlp.mlp2_bias": "model--00005-of-00007.safetensors",
453
- "block.35.mlp.mlp2_weight.blocks": "model--00005-of-00007.safetensors",
454
- "block.35.mlp.mlp2_weight.scales": "model--00005-of-00007.safetensors",
455
- "block.35.mlp.norm.scale": "model--00005-of-00007.safetensors",
456
- "block.4.attn.norm.scale": "model--00005-of-00007.safetensors",
457
- "block.4.attn.out.bias": "model--00005-of-00007.safetensors",
458
- "block.4.attn.out.weight": "model--00005-of-00007.safetensors",
459
- "block.4.attn.qkv.bias": "model--00005-of-00007.safetensors",
460
- "block.4.attn.qkv.weight": "model--00005-of-00007.safetensors",
461
- "block.4.attn.sinks": "model--00005-of-00007.safetensors",
462
- "block.4.mlp.gate.bias": "model--00005-of-00007.safetensors",
463
- "block.4.mlp.gate.weight": "model--00005-of-00007.safetensors",
464
- "block.4.mlp.mlp1_bias": "model--00005-of-00007.safetensors",
465
- "block.4.mlp.mlp1_weight.blocks": "model--00006-of-00007.safetensors",
466
- "block.4.mlp.mlp1_weight.scales": "model--00006-of-00007.safetensors",
467
- "block.4.mlp.mlp2_bias": "model--00006-of-00007.safetensors",
468
- "block.4.mlp.mlp2_weight.blocks": "model--00006-of-00007.safetensors",
469
- "block.4.mlp.mlp2_weight.scales": "model--00006-of-00007.safetensors",
470
- "block.4.mlp.norm.scale": "model--00006-of-00007.safetensors",
471
- "block.5.attn.norm.scale": "model--00006-of-00007.safetensors",
472
- "block.5.attn.out.bias": "model--00006-of-00007.safetensors",
473
- "block.5.attn.out.weight": "model--00006-of-00007.safetensors",
474
- "block.5.attn.qkv.bias": "model--00006-of-00007.safetensors",
475
- "block.5.attn.qkv.weight": "model--00006-of-00007.safetensors",
476
- "block.5.attn.sinks": "model--00006-of-00007.safetensors",
477
- "block.5.mlp.gate.bias": "model--00006-of-00007.safetensors",
478
- "block.5.mlp.gate.weight": "model--00006-of-00007.safetensors",
479
- "block.5.mlp.mlp1_bias": "model--00006-of-00007.safetensors",
480
- "block.5.mlp.mlp1_weight.blocks": "model--00006-of-00007.safetensors",
481
- "block.5.mlp.mlp1_weight.scales": "model--00006-of-00007.safetensors",
482
- "block.5.mlp.mlp2_bias": "model--00006-of-00007.safetensors",
483
- "block.5.mlp.mlp2_weight.blocks": "model--00006-of-00007.safetensors",
484
- "block.5.mlp.mlp2_weight.scales": "model--00006-of-00007.safetensors",
485
- "block.5.mlp.norm.scale": "model--00006-of-00007.safetensors",
486
- "block.6.attn.norm.scale": "model--00006-of-00007.safetensors",
487
- "block.6.attn.out.bias": "model--00006-of-00007.safetensors",
488
- "block.6.attn.out.weight": "model--00006-of-00007.safetensors",
489
- "block.6.attn.qkv.bias": "model--00006-of-00007.safetensors",
490
- "block.6.attn.qkv.weight": "model--00006-of-00007.safetensors",
491
- "block.6.attn.sinks": "model--00006-of-00007.safetensors",
492
- "block.6.mlp.gate.bias": "model--00006-of-00007.safetensors",
493
- "block.6.mlp.gate.weight": "model--00006-of-00007.safetensors",
494
- "block.6.mlp.mlp1_bias": "model--00006-of-00007.safetensors",
495
- "block.6.mlp.mlp1_weight.blocks": "model--00006-of-00007.safetensors",
496
- "block.6.mlp.mlp1_weight.scales": "model--00006-of-00007.safetensors",
497
- "block.6.mlp.mlp2_bias": "model--00006-of-00007.safetensors",
498
- "block.6.mlp.mlp2_weight.blocks": "model--00006-of-00007.safetensors",
499
- "block.6.mlp.mlp2_weight.scales": "model--00006-of-00007.safetensors",
500
- "block.6.mlp.norm.scale": "model--00006-of-00007.safetensors",
501
- "block.7.attn.norm.scale": "model--00006-of-00007.safetensors",
502
- "block.7.attn.out.bias": "model--00006-of-00007.safetensors",
503
- "block.7.attn.out.weight": "model--00006-of-00007.safetensors",
504
- "block.7.attn.qkv.bias": "model--00006-of-00007.safetensors",
505
- "block.7.attn.qkv.weight": "model--00006-of-00007.safetensors",
506
- "block.7.attn.sinks": "model--00006-of-00007.safetensors",
507
- "block.7.mlp.gate.bias": "model--00006-of-00007.safetensors",
508
- "block.7.mlp.gate.weight": "model--00006-of-00007.safetensors",
509
- "block.7.mlp.mlp1_bias": "model--00006-of-00007.safetensors",
510
- "block.7.mlp.mlp1_weight.blocks": "model--00006-of-00007.safetensors",
511
- "block.7.mlp.mlp1_weight.scales": "model--00006-of-00007.safetensors",
512
- "block.7.mlp.mlp2_bias": "model--00006-of-00007.safetensors",
513
- "block.7.mlp.mlp2_weight.blocks": "model--00006-of-00007.safetensors",
514
- "block.7.mlp.mlp2_weight.scales": "model--00006-of-00007.safetensors",
515
- "block.7.mlp.norm.scale": "model--00006-of-00007.safetensors",
516
- "block.8.attn.norm.scale": "model--00006-of-00007.safetensors",
517
- "block.8.attn.out.bias": "model--00006-of-00007.safetensors",
518
- "block.8.attn.out.weight": "model--00006-of-00007.safetensors",
519
- "block.8.attn.qkv.bias": "model--00006-of-00007.safetensors",
520
- "block.8.attn.qkv.weight": "model--00006-of-00007.safetensors",
521
- "block.8.attn.sinks": "model--00006-of-00007.safetensors",
522
- "block.8.mlp.gate.bias": "model--00006-of-00007.safetensors",
523
- "block.8.mlp.gate.weight": "model--00006-of-00007.safetensors",
524
- "block.8.mlp.mlp1_bias": "model--00006-of-00007.safetensors",
525
- "block.8.mlp.mlp1_weight.blocks": "model--00006-of-00007.safetensors",
526
- "block.8.mlp.mlp1_weight.scales": "model--00006-of-00007.safetensors",
527
- "block.8.mlp.mlp2_bias": "model--00006-of-00007.safetensors",
528
- "block.8.mlp.mlp2_weight.blocks": "model--00006-of-00007.safetensors",
529
- "block.8.mlp.mlp2_weight.scales": "model--00006-of-00007.safetensors",
530
- "block.8.mlp.norm.scale": "model--00006-of-00007.safetensors",
531
- "block.9.attn.norm.scale": "model--00006-of-00007.safetensors",
532
- "block.9.attn.out.bias": "model--00006-of-00007.safetensors",
533
- "block.9.attn.out.weight": "model--00006-of-00007.safetensors",
534
- "block.9.attn.qkv.bias": "model--00006-of-00007.safetensors",
535
- "block.9.attn.qkv.weight": "model--00006-of-00007.safetensors",
536
- "block.9.attn.sinks": "model--00006-of-00007.safetensors",
537
- "block.9.mlp.gate.bias": "model--00006-of-00007.safetensors",
538
- "block.9.mlp.gate.weight": "model--00006-of-00007.safetensors",
539
- "block.9.mlp.mlp1_bias": "model--00006-of-00007.safetensors",
540
- "block.9.mlp.mlp1_weight.blocks": "model--00006-of-00007.safetensors",
541
- "block.9.mlp.mlp1_weight.scales": "model--00006-of-00007.safetensors",
542
- "block.9.mlp.mlp2_bias": "model--00006-of-00007.safetensors",
543
- "block.9.mlp.mlp2_weight.blocks": "model--00006-of-00007.safetensors",
544
- "block.9.mlp.mlp2_weight.scales": "model--00006-of-00007.safetensors",
545
- "block.9.mlp.norm.scale": "model--00006-of-00007.safetensors",
546
- "embedding.weight": "model--00007-of-00007.safetensors",
547
- "norm.scale": "model--00007-of-00007.safetensors",
548
- "unembedding.weight": "model--00007-of-00007.safetensors"
549
- }
550
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
special_tokens_map.json CHANGED
@@ -1,5 +1,23 @@
1
  {
2
- "bos_token": "<|startoftext|>",
3
- "eos_token": "<|return|>",
4
- "pad_token": "<|endoftext|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|return|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|reserved_200017|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
  }
tokenizer_config.json CHANGED
@@ -177,7 +177,10 @@
177
  "input_ids",
178
  "attention_mask"
179
  ],
180
- "model_max_length": 1000000000000000019884624838656,
181
- "pad_token": "<|endoftext|>",
182
- "tokenizer_class": "PreTrainedTokenizerFast"
183
- }
 
 
 
 
177
  "input_ids",
178
  "attention_mask"
179
  ],
180
+ "model_max_length": 131072,
181
+ "pad_token": "<|reserved_200017|>",
182
+ "padding_side": "left",
183
+ "tokenizer_class": "PreTrainedTokenizerFast",
184
+ "unk_token": null,
185
+ "chat_template": "{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}\n{#-\n In addition to the normal inputs of `messages` and `tools`, this template also accepts the\n following kwargs:\n - \"builtin_tools\": A list, can contain \"browser\" and/or \"python\".\n - \"model_identity\": A string that optionally describes the model identity.\n - \"reasoning_effort\": A string that describes the reasoning effort, defaults to \"medium\".\n #}\n\n{#- Tool Definition Rendering ============================================== #}\n{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}\n {%- if param_spec.type == \"array\" -%}\n {%- if param_spec['items'] -%}\n {%- if param_spec['items']['type'] == \"string\" -%}\n {{- \"string[]\" }}\n {%- elif param_spec['items']['type'] == \"number\" -%}\n {{- \"number[]\" }}\n {%- elif param_spec['items']['type'] == \"integer\" -%}\n {{- \"number[]\" }}\n {%- elif param_spec['items']['type'] == \"boolean\" -%}\n {{- \"boolean[]\" }}\n {%- else -%}\n {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}\n {%- if inner_type == \"object | object\" or inner_type|length > 50 -%}\n {{- \"any[]\" }}\n {%- else -%}\n {{- inner_type + \"[]\" }}\n {%- endif -%}\n {%- endif -%}\n {%- if param_spec.nullable -%}\n {{- \" | null\" }}\n {%- endif -%}\n {%- else -%}\n {{- \"any[]\" }}\n {%- if param_spec.nullable -%}\n {{- \" | null\" }}\n {%- endif -%}\n {%- endif -%}\n {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}\n {#- Handle array of types like [\"object\", \"object\"] from Union[dict, list] #}\n {%- if param_spec.type | length > 1 -%}\n {{- param_spec.type | join(\" | \") }}\n {%- else -%}\n {{- param_spec.type[0] }}\n {%- endif -%}\n {%- elif param_spec.oneOf -%}\n {#- Handle oneOf schemas - check for complex unions and fallback to any #}\n {%- set has_object_variants = false -%}\n {%- for variant in param_spec.oneOf -%}\n {%- if variant.type == \"object\" -%}\n {%- set has_object_variants = true -%}\n {%- endif -%}\n {%- endfor -%}\n {%- if has_object_variants and param_spec.oneOf|length > 1 -%}\n {{- \"any\" }}\n {%- else -%}\n {%- for variant in param_spec.oneOf -%}\n {{- render_typescript_type(variant, required_params) -}}\n {%- if variant.description %}\n {{- \"// \" + variant.description }}\n {%- endif -%}\n {%- if variant.default is defined %}\n {{ \"// default: \" + variant.default|tojson }}\n {%- endif -%}\n {%- if not loop.last %}\n {{- \" | \" }}\n {% endif -%}\n {%- endfor -%}\n {%- endif -%}\n {%- elif param_spec.type == \"string\" -%}\n {%- if param_spec.enum -%}\n {{- '\"' + param_spec.enum|join('\" | \"') + '\"' -}}\n {%- else -%}\n {{- \"string\" }}\n {%- if param_spec.nullable %}\n {{- \" | null\" }}\n {%- endif -%}\n {%- endif -%}\n {%- elif param_spec.type == \"number\" -%}\n {{- \"number\" }}\n {%- elif param_spec.type == \"integer\" -%}\n {{- \"number\" }}\n {%- elif param_spec.type == \"boolean\" -%}\n {{- \"boolean\" }}\n\n {%- elif param_spec.type == \"object\" -%}\n {%- if param_spec.properties -%}\n {{- \"{\\n\" }}\n {%- for prop_name, prop_spec in param_spec.properties.items() -%}\n {{- prop_name -}}\n {%- if prop_name not in (param_spec.required or []) -%}\n {{- \"?\" }}\n {%- endif -%}\n {{- \": \" }}\n {{ render_typescript_type(prop_spec, param_spec.required or []) }}\n {%- if not loop.last -%}\n {{-\", \" }}\n {%- endif -%}\n {%- endfor -%}\n {{- \"}\" }}\n {%- else -%}\n {{- \"object\" }}\n {%- endif -%}\n {%- else -%}\n {{- \"any\" }}\n {%- endif -%}\n{%- endmacro -%}\n\n{%- macro render_tool_namespace(namespace_name, tools) -%}\n {{- \"## \" + namespace_name + \"\\n\\n\" }}\n {{- \"namespace \" + namespace_name + \" {\\n\\n\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- \"// \" + tool.description + \"\\n\" }}\n {{- \"type \"+ tool.name + \" = \" }}\n {%- if tool.parameters and tool.parameters.properties -%}\n {{- \"(_: \" }}\n {{- \"{\\n\" }}\n {%- for param_name, param_spec in tool.parameters.properties.items() %}\n {{- \"// \" + param_spec.description + \"\\n\" }}\n {{- param_name }}\n {%- if param_name not in (tool.parameters.required or []) -%}\n {{- \"?\" }}\n {%- endif -%}\n {{- \": \" }}\n {{- render_typescript_type(param_spec, tool.parameters.required or []) }}\n {%- if param_spec.default is defined -%}\n {%- if param_spec.enum %}\n {{- \", // default: \" + param_spec.default }}\n {%- elif param_spec.oneOf %}\n {{- \"// default: \" + param_spec.default }}\n {%- else %}\n {{- \", // default: \" + param_spec.default|tojson }}\n {%- endif -%}\n {%- endif -%}\n {%- if not loop.last %}\n {{- \",\\n\" }}\n {%- else %}\n {{- \"\\n\" }}\n {%- endif -%}\n {%- endfor %}\n {{- \"}) => any;\\n\\n\" }}\n {%- else -%}\n {{- \"() => any;\\n\\n\" }}\n {%- endif -%}\n {%- endfor %}\n {{- \"} // namespace \" + namespace_name }}\n{%- endmacro -%}\n\n{%- macro render_builtin_tools(browser_tool, python_tool) -%}\n {%- if browser_tool %}\n {{- \"## browser\\n\\n\" }}\n {{- \"// Tool for browsing.\\n\" }}\n {{- \"// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\\n\" }}\n {{- \"// Cite information from the tool using the following format:\\n\" }}\n {{- \"// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\\n\" }}\n {{- \"// Do not quote more than 10 words directly from the tool output.\\n\" }}\n {{- \"// sources=web (default: web)\\n\" }}\n {{- \"namespace browser {\\n\\n\" }}\n {{- \"// Searches for information related to `query` and displays `topn` results.\\n\" }}\n {{- \"type search = (_: {\\n\" }}\n {{- \"query: string,\\n\" }}\n {{- \"topn?: number, // default: 10\\n\" }}\n {{- \"source?: string,\\n\" }}\n {{- \"}) => any;\\n\\n\" }}\n {{- \"// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\\n\" }}\n {{- \"// Valid link ids are displayed with the formatting: `【{id}†.*】`.\\n\" }}\n {{- \"// If `cursor` is not provided, the most recent page is implied.\\n\" }}\n {{- \"// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\\n\" }}\n {{- \"// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\\n\" }}\n {{- \"// Use this function without `id` to scroll to a new location of an opened page.\\n\" }}\n {{- \"type open = (_: {\\n\" }}\n {{- \"id?: number | string, // default: -1\\n\" }}\n {{- \"cursor?: number, // default: -1\\n\" }}\n {{- \"loc?: number, // default: -1\\n\" }}\n {{- \"num_lines?: number, // default: -1\\n\" }}\n {{- \"view_source?: boolean, // default: false\\n\" }}\n {{- \"source?: string,\\n\" }}\n {{- \"}) => any;\\n\\n\" }}\n {{- \"// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\\n\" }}\n {{- \"type find = (_: {\\n\" }}\n {{- \"pattern: string,\\n\" }}\n {{- \"cursor?: number, // default: -1\\n\" }}\n {{- \"}) => any;\\n\\n\" }}\n {{- \"} // namespace browser\\n\\n\" }}\n {%- endif -%}\n\n {%- if python_tool %}\n {{- \"## python\\n\\n\" }}\n {{- \"Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\\n\\n\" }}\n {{- \"When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\\n\\n\" }}\n {%- endif -%}\n{%- endmacro -%}\n\n{#- System Message Construction ============================================ #}\n{%- macro build_system_message() -%}\n {%- if model_identity is not defined %}\n {{- \"You are ChatGPT, a large language model trained by OpenAI.\\n\" -}}\n {%- else %}\n {{- model_identity }}\n {%- endif %}\n {{- \"Knowledge cutoff: 2024-06\\n\" }}\n {{- \"Current date: \" + strftime_now(\"%Y-%m-%d\") + \"\\n\\n\" }}\n {%- if reasoning_effort is not defined %}\n {%- set reasoning_effort = \"medium\" %}\n {%- endif %}\n {{- \"Reasoning: \" + reasoning_effort + \"\\n\\n\" }}\n {%- if builtin_tools is defined %}\n {{- \"# Tools\\n\\n\" }}\n {%- set available_builtin_tools = namespace(browser=false, python=false) %}\n {%- for tool in builtin_tools %}\n {%- if tool == \"browser\" %}\n {%- set available_builtin_tools.browser = true %}\n {%- elif tool == \"python\" %}\n {%- set available_builtin_tools.python = true %}\n {%- endif %}\n {%- endfor %}\n {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}\n {%- endif -%}\n {{- \"# Valid channels: analysis, commentary, final. Channel must be included for every message.\" }}\n {%- if tools is defined -%}\n {{- \"\\nCalls to these tools must go to the commentary channel: 'functions'.\" }}\n {%- endif -%}\n{%- endmacro -%}\n\n{#- Main Template Logic ================================================= #}\n{#- Set defaults #}\n\n{#- Render system message #}\n{{- \"<|start|>system<|message|>\" }}\n{{- build_system_message() }}\n{{- \"<|end|>\" }}\n\n{#- Extract developer message #}\n{%- if messages[0].role == \"developer\" or messages[0].role == \"system\" %}\n {%- set developer_message = messages[0].content %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set developer_message = \"\" %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{#- Render developer message #}\n{%- if developer_message or tools %}\n {{- \"<|start|>developer<|message|>\" }}\n {%- if developer_message %}\n {{- \"# Instructions\\n\\n\" }}\n {{- developer_message }}\n {%- endif %}\n {%- if tools -%}\n {{- \"\\n\\n\" }}\n {{- \"# Tools\\n\\n\" }}\n {{- render_tool_namespace(\"functions\", tools) }}\n {%- endif -%}\n {{- \"<|end|>\" }}\n{%- endif %}\n\n{#- Render messages #}\n{%- set last_tool_call = namespace(name=none) %}\n{%- for message in loop_messages -%}\n {#- At this point only assistant/user/tool messages should remain #}\n {%- if message.role == 'assistant' -%}\n {%- if \"tool_calls\" in message %}\n {#- We assume max 1 tool call per message, and so we infer the tool call name #}\n {#- in \"tool\" messages from the most recent assistant tool call name #}\n {%- set tool_call = message.tool_calls[0] %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {%- if message.content %}\n {{- \"<|start|>assistant<|channel|>analysis<|message|>\" + message.content + \"<|end|>\" }}\n {%- endif %}\n {{- \"<|start|>assistant to=\" }}\n {{- \"functions.\" + tool_call.name + \"<|channel|>commentary json<|message|>\" }}\n {{- tool_call.arguments|tojson }}\n {{- \"<|call|>\" }}\n {%- set last_tool_call.name = tool_call.name %}\n {%- elif \"thinking\" in message and loop.last and not add_generation_prompt %}\n {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}\n {#- This is a situation that should only occur in training, never in inference. #}\n {{- \"<|start|>assistant<|channel|>analysis<|message|>\" + message.thinking + \"<|end|>\" }}\n {#- <|return|> indicates the end of generation, but <|end|> does not #}\n {#- <|return|> should never be an input to the model, but we include it as the final token #}\n {#- when training, so the model learns to emit it. #}\n {{- \"<|start|>assistant<|channel|>final<|message|>\" + message.content + \"<|return|>\" }}\n {%- set last_tool_call.name = none %}\n {%- elif \"thinking\" in message %}\n {#- CoT is dropped during all previous turns, so we never render it for inference #}\n {{- \"<|start|>assistant<|channel|>final<|message|>\" + message.content + \"<|end|>\" }}\n {%- set last_tool_call.name = none %}\n {%- elif loop.last and not add_generation_prompt %}\n {#- <|return|> indicates the end of generation, but <|end|> does not #}\n {#- <|return|> should never be an input to the model, but we include it as the final token #}\n {#- when training, so the model learns to emit it. #}\n {{- \"<|start|>assistant<|message|>\" + message.content + \"<|return|>\" }}\n {%- else %}\n {{- \"<|start|>assistant<|message|>\" + message.content + \"<|end|>\" }}\n {%- set last_tool_call.name = none %}\n {%- endif %}\n {%- elif message.role == 'tool' -%}\n {%- if last_tool_call.name is none %}\n {{- raise_exception(\"Message has tool role, but there was no previous assistant message with a tool call!\") }}\n {%- endif %}\n {{- \"<|start|>functions.\" + last_tool_call.name }}\n {{- \" to=assistant<|channel|>commentary<|message|>\" + message.content|tojson + \"<|end|>\" }}\n {%- else -%}\n {{- \"<|start|>user<|message|>\" + message.content + \"<|end|>\" }}\n {%- endif -%}\n{%- endfor -%}\n\n{#- Generation prompt #}\n{%- if add_generation_prompt -%}\n<|start|>assistant\n{%- endif -%}\n{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}"
186
+ }