justinjja committed on
Commit 7ac362f · verified · 1 Parent(s): 05170e9

Add INT4-W4A16 checkpoint

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full file list.
Files changed (50)
  1. .gitattributes +1 -0
  2. added_tokens.json +38 -0
  3. chat_template.jinja +24 -0
  4. config.json +486 -0
  5. configuration_minimax_m1.py +152 -0
  6. generation_config.json +4 -0
  7. merges.txt +0 -0
  8. model-00001-of-00054.safetensors +3 -0
  9. model-00002-of-00054.safetensors +3 -0
  10. model-00003-of-00054.safetensors +3 -0
  11. model-00004-of-00054.safetensors +3 -0
  12. model-00005-of-00054.safetensors +3 -0
  13. model-00006-of-00054.safetensors +3 -0
  14. model-00007-of-00054.safetensors +3 -0
  15. model-00008-of-00054.safetensors +3 -0
  16. model-00009-of-00054.safetensors +3 -0
  17. model-00010-of-00054.safetensors +3 -0
  18. model-00011-of-00054.safetensors +3 -0
  19. model-00012-of-00054.safetensors +3 -0
  20. model-00013-of-00054.safetensors +3 -0
  21. model-00014-of-00054.safetensors +3 -0
  22. model-00015-of-00054.safetensors +3 -0
  23. model-00016-of-00054.safetensors +3 -0
  24. model-00017-of-00054.safetensors +3 -0
  25. model-00018-of-00054.safetensors +3 -0
  26. model-00019-of-00054.safetensors +3 -0
  27. model-00020-of-00054.safetensors +3 -0
  28. model-00021-of-00054.safetensors +3 -0
  29. model-00022-of-00054.safetensors +3 -0
  30. model-00023-of-00054.safetensors +3 -0
  31. model-00024-of-00054.safetensors +3 -0
  32. model-00025-of-00054.safetensors +3 -0
  33. model-00026-of-00054.safetensors +3 -0
  34. model-00027-of-00054.safetensors +3 -0
  35. model-00028-of-00054.safetensors +3 -0
  36. model-00029-of-00054.safetensors +3 -0
  37. model-00030-of-00054.safetensors +3 -0
  38. model-00031-of-00054.safetensors +3 -0
  39. model-00032-of-00054.safetensors +3 -0
  40. model-00033-of-00054.safetensors +3 -0
  41. model-00034-of-00054.safetensors +3 -0
  42. model-00035-of-00054.safetensors +3 -0
  43. model-00036-of-00054.safetensors +3 -0
  44. model-00037-of-00054.safetensors +3 -0
  45. model-00038-of-00054.safetensors +3 -0
  46. model-00039-of-00054.safetensors +3 -0
  47. model-00040-of-00054.safetensors +3 -0
  48. model-00041-of-00054.safetensors +3 -0
  49. model-00042-of-00054.safetensors +3 -0
  50. model-00043-of-00054.safetensors +3 -0
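For context, a minimal sketch of loading this W4A16 checkpoint (not part of this commit): it assumes a vLLM build that supports the MiniMaxM1 architecture and compressed-tensors INT4 weight-only checkpoints, and "user/MiniMax-M1-INT4-W4A16" is a placeholder for this repository's id.

```python
# Hypothetical loading sketch; repo id and tensor_parallel_size are placeholders.
from vllm import LLM, SamplingParams

llm = LLM(
    model="user/MiniMax-M1-INT4-W4A16",  # placeholder for this repo's id
    trust_remote_code=True,              # config/modeling code ships with the repo
    tensor_parallel_size=8,              # adjust to your GPU count
)
outputs = llm.generate(["Hello!"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```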
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "<begin_of_document>": 200034,
+ "<beginning_of_sentence>": 200019,
+ "<code_interpreter>": 200023,
+ "<commit_after>": 200018,
+ "<commit_before>": 200016,
+ "<commit_msg>": 200017,
+ "<empty_output>": 200015,
+ "<end_of_document>": 200021,
+ "<end_of_image>": 200030,
+ "<end_of_sentence>": 200020,
+ "<end_of_speech>": 200028,
+ "<end_of_video>": 200032,
+ "<filename>": 200006,
+ "<fim_middle>": 200002,
+ "<fim_pad>": 200004,
+ "<fim_prefix>": 200001,
+ "<fim_suffix>": 200003,
+ "<function_call>": 200022,
+ "<gh_stars>": 200007,
+ "<image>": 200025,
+ "<issue_closed>": 200010,
+ "<issue_comment>": 200009,
+ "<issue_start>": 200008,
+ "<jupyter_code>": 200013,
+ "<jupyter_error>": 200035,
+ "<jupyter_output>": 200014,
+ "<jupyter_start>": 200011,
+ "<jupyter_text>": 200012,
+ "<pad>": 200000,
+ "<reponame>": 200005,
+ "<speech>": 200024,
+ "<start_of_image>": 200029,
+ "<start_of_speech>": 200027,
+ "<start_of_video>": 200031,
+ "<video>": 200026,
+ "<vision_pad>": 200033
+ }
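The 36 mappings above pin the special tokens to ids 200000–200035, inside the vocab_size of 200064 declared in config.json. A quick sanity-check sketch, assuming a local clone of this repo at a placeholder path:

```python
# Verify the added-token ids survive tokenizer loading (path is a placeholder).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./MiniMax-M1-INT4-W4A16", trust_remote_code=True)
assert tok.convert_tokens_to_ids("<pad>") == 200000
assert tok.convert_tokens_to_ids("<beginning_of_sentence>") == 200019
assert tok.convert_tokens_to_ids("<end_of_sentence>") == 200020
```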
chat_template.jinja ADDED
@@ -0,0 +1,24 @@
+ {{ '<begin_of_document>' -}}{% set ns = namespace(system_prompt='') -%}{% for message in messages -%}{% if message['role'] == 'system' -%}{% set ns.system_prompt = ns.system_prompt + message['content'][0]['text'] -%}{% endif -%}{%- endfor -%}{% if ns.system_prompt != '' -%}{{ '<beginning_of_sentence>system ai_setting=assistant
+ ' + ns.system_prompt + '<end_of_sentence>
+ ' -}}{%- endif -%}{% if tools -%}{{ '<beginning_of_sentence>system tool_setting=tools
+ You are provided with these tools:
+ <tools>
+ ' -}}{% for tool in tools -%}{{ tool | tojson ~ '
+ ' -}}{%- endfor -%}{{ '</tools>
+
+ If you need to call tools, please respond with <tool_calls></tool_calls> XML tags, and provide tool-name and json-object of arguments, following the format below:
+ <tool_calls>
+ {''name'': <tool-name-1>, ''arguments'': <args-json-object-1>}
+ ...
+ </tool_calls><end_of_sentence>
+ ' -}}{%- endif -%}{% for message in messages -%}{% if message['role'] == 'user' -%}{{ '<beginning_of_sentence>user name=user
+ ' + message['content'][0]['text'] + '<end_of_sentence>
+ ' -}}{% elif message['role'] == 'assistant' -%}{{ '<beginning_of_sentence>ai name=assistant
+ ' -}}{% for content in message['content'] | selectattr('type', 'equalto', 'text') -%}{{ content['text'] -}}{%- endfor -%}{{ '<end_of_sentence>
+ ' -}}{% elif message['role'] == 'tool' -%}{{ '<beginning_of_sentence>tool name=tools
+ ' }} {%- for content in message['content'] -%}{{- 'tool name: ' + content['name'] + '
+ ' + 'tool result: ' + content['text'] + '
+
+ ' -}} {%- endfor -%}{{- '<end_of_sentence>
+ ' -}}{% endif -%}{%- endfor -%}{% if add_generation_prompt -%}{{ '<beginning_of_sentence>ai name=assistant
+ ' -}}{%- endif -%}
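One detail worth flagging: the template reads message['content'][0]['text'] everywhere, so messages must use the list-of-parts content format rather than bare strings. A minimal rendering sketch (placeholder local path; assumes a transformers version that picks up chat_template.jinja):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./MiniMax-M1-INT4-W4A16", trust_remote_code=True)

# Content must be a list of {"type": "text", "text": ...} parts, not a string,
# because the template indexes message['content'][0]['text'].
messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [{"type": "text", "text": "Hello!"}]},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # begins with <begin_of_document><beginning_of_sentence>system ...
```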
config.json ADDED
@@ -0,0 +1,486 @@
+ {
+ "architectures": [
+ "MiniMaxM1ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "attn_type_list": [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1
+ ],
+ "auto_map": {
+ "AutoConfig": "MiniMaxAI/MiniMax-M1-80k--configuration_minimax_m1.MiniMaxM1Config",
+ "AutoModelForCausalLM": "MiniMaxAI/MiniMax-M1-80k--modeling_minimax_m1.MiniMaxM1ForCausalLM"
+ },
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 6144,
+ "initializer_range": 0.02,
+ "intermediate_size": 9216,
+ "layernorm_full_attention_alpha": 3.5565588200778455,
+ "layernorm_full_attention_beta": 1.0,
+ "layernorm_linear_attention_alpha": 3.5565588200778455,
+ "layernorm_linear_attention_beta": 1.0,
+ "layernorm_mlp_alpha": 3.5565588200778455,
+ "layernorm_mlp_beta": 1.0,
+ "max_position_embeddings": 10240000,
+ "model_type": "MiniMaxM1",
+ "num_attention_heads": 64,
+ "num_experts_per_tok": 2,
+ "num_hidden_layers": 80,
+ "num_key_value_heads": 8,
+ "num_local_experts": 32,
+ "output_router_logits": false,
+ "postnorm": true,
+ "quantization_config": {
+ "config_groups": {
+ "group_0": {
+ "input_activations": null,
+ "output_activations": null,
+ "targets": [
+ "Linear"
+ ],
+ "weights": {
+ "actorder": null,
+ "block_structure": null,
+ "dynamic": false,
+ "group_size": 128,
+ "num_bits": 4,
+ "observer": "minmax",
+ "observer_kwargs": {},
+ "strategy": "group",
+ "symmetric": true,
+ "type": "int"
+ }
+ }
+ },
+ "format": "pack-quantized",
+ "global_compression_ratio": null,
+ "ignore": [
+ "model.layers.0.self_attn.out_proj",
+ "model.layers.0.self_attn.qkv_proj",
+ "model.layers.0.self_attn.output_gate",
+ "model.layers.0.block_sparse_moe.gate",
+ "model.layers.1.self_attn.out_proj",
+ "model.layers.1.self_attn.qkv_proj",
+ "model.layers.1.self_attn.output_gate",
+ "model.layers.1.block_sparse_moe.gate",
+ "model.layers.2.self_attn.out_proj",
+ "model.layers.2.self_attn.qkv_proj",
+ "model.layers.2.self_attn.output_gate",
+ "model.layers.2.block_sparse_moe.gate",
+ "model.layers.3.self_attn.out_proj",
+ "model.layers.3.self_attn.qkv_proj",
+ "model.layers.3.self_attn.output_gate",
+ "model.layers.3.block_sparse_moe.gate",
+ "model.layers.4.self_attn.out_proj",
+ "model.layers.4.self_attn.qkv_proj",
+ "model.layers.4.self_attn.output_gate",
+ "model.layers.4.block_sparse_moe.gate",
+ "model.layers.5.self_attn.out_proj",
+ "model.layers.5.self_attn.qkv_proj",
+ "model.layers.5.self_attn.output_gate",
+ "model.layers.5.block_sparse_moe.gate",
+ "model.layers.6.self_attn.out_proj",
+ "model.layers.6.self_attn.qkv_proj",
+ "model.layers.6.self_attn.output_gate",
+ "model.layers.6.block_sparse_moe.gate",
+ "model.layers.7.self_attn.q_proj",
+ "model.layers.7.self_attn.k_proj",
+ "model.layers.7.self_attn.v_proj",
+ "model.layers.7.self_attn.o_proj",
+ "model.layers.7.block_sparse_moe.gate",
+ "model.layers.8.self_attn.out_proj",
+ "model.layers.8.self_attn.qkv_proj",
+ "model.layers.8.self_attn.output_gate",
+ "model.layers.8.block_sparse_moe.gate",
+ "model.layers.9.self_attn.out_proj",
+ "model.layers.9.self_attn.qkv_proj",
+ "model.layers.9.self_attn.output_gate",
+ "model.layers.9.block_sparse_moe.gate",
+ "model.layers.10.self_attn.out_proj",
+ "model.layers.10.self_attn.qkv_proj",
+ "model.layers.10.self_attn.output_gate",
+ "model.layers.10.block_sparse_moe.gate",
+ "model.layers.11.self_attn.out_proj",
+ "model.layers.11.self_attn.qkv_proj",
+ "model.layers.11.self_attn.output_gate",
+ "model.layers.11.block_sparse_moe.gate",
+ "model.layers.12.self_attn.out_proj",
+ "model.layers.12.self_attn.qkv_proj",
+ "model.layers.12.self_attn.output_gate",
+ "model.layers.12.block_sparse_moe.gate",
+ "model.layers.13.self_attn.out_proj",
+ "model.layers.13.self_attn.qkv_proj",
+ "model.layers.13.self_attn.output_gate",
+ "model.layers.13.block_sparse_moe.gate",
+ "model.layers.14.self_attn.out_proj",
+ "model.layers.14.self_attn.qkv_proj",
+ "model.layers.14.self_attn.output_gate",
+ "model.layers.14.block_sparse_moe.gate",
+ "model.layers.15.self_attn.q_proj",
+ "model.layers.15.self_attn.k_proj",
+ "model.layers.15.self_attn.v_proj",
+ "model.layers.15.self_attn.o_proj",
+ "model.layers.15.block_sparse_moe.gate",
+ "model.layers.16.self_attn.out_proj",
+ "model.layers.16.self_attn.qkv_proj",
+ "model.layers.16.self_attn.output_gate",
+ "model.layers.16.block_sparse_moe.gate",
+ "model.layers.17.self_attn.out_proj",
+ "model.layers.17.self_attn.qkv_proj",
+ "model.layers.17.self_attn.output_gate",
+ "model.layers.17.block_sparse_moe.gate",
+ "model.layers.18.self_attn.out_proj",
+ "model.layers.18.self_attn.qkv_proj",
+ "model.layers.18.self_attn.output_gate",
+ "model.layers.18.block_sparse_moe.gate",
+ "model.layers.19.self_attn.out_proj",
+ "model.layers.19.self_attn.qkv_proj",
+ "model.layers.19.self_attn.output_gate",
+ "model.layers.19.block_sparse_moe.gate",
+ "model.layers.20.self_attn.out_proj",
+ "model.layers.20.self_attn.qkv_proj",
+ "model.layers.20.self_attn.output_gate",
+ "model.layers.20.block_sparse_moe.gate",
+ "model.layers.21.self_attn.out_proj",
+ "model.layers.21.self_attn.qkv_proj",
+ "model.layers.21.self_attn.output_gate",
+ "model.layers.21.block_sparse_moe.gate",
+ "model.layers.22.self_attn.out_proj",
+ "model.layers.22.self_attn.qkv_proj",
+ "model.layers.22.self_attn.output_gate",
+ "model.layers.22.block_sparse_moe.gate",
+ "model.layers.23.self_attn.q_proj",
+ "model.layers.23.self_attn.k_proj",
+ "model.layers.23.self_attn.v_proj",
+ "model.layers.23.self_attn.o_proj",
+ "model.layers.23.block_sparse_moe.gate",
+ "model.layers.24.self_attn.out_proj",
+ "model.layers.24.self_attn.qkv_proj",
+ "model.layers.24.self_attn.output_gate",
+ "model.layers.24.block_sparse_moe.gate",
+ "model.layers.25.self_attn.out_proj",
+ "model.layers.25.self_attn.qkv_proj",
+ "model.layers.25.self_attn.output_gate",
+ "model.layers.25.block_sparse_moe.gate",
+ "model.layers.26.self_attn.out_proj",
+ "model.layers.26.self_attn.qkv_proj",
+ "model.layers.26.self_attn.output_gate",
+ "model.layers.26.block_sparse_moe.gate",
+ "model.layers.27.self_attn.out_proj",
+ "model.layers.27.self_attn.qkv_proj",
+ "model.layers.27.self_attn.output_gate",
+ "model.layers.27.block_sparse_moe.gate",
+ "model.layers.28.self_attn.out_proj",
+ "model.layers.28.self_attn.qkv_proj",
+ "model.layers.28.self_attn.output_gate",
+ "model.layers.28.block_sparse_moe.gate",
+ "model.layers.29.self_attn.out_proj",
+ "model.layers.29.self_attn.qkv_proj",
+ "model.layers.29.self_attn.output_gate",
+ "model.layers.29.block_sparse_moe.gate",
+ "model.layers.30.self_attn.out_proj",
+ "model.layers.30.self_attn.qkv_proj",
+ "model.layers.30.self_attn.output_gate",
+ "model.layers.30.block_sparse_moe.gate",
+ "model.layers.31.self_attn.q_proj",
+ "model.layers.31.self_attn.k_proj",
+ "model.layers.31.self_attn.v_proj",
+ "model.layers.31.self_attn.o_proj",
+ "model.layers.31.block_sparse_moe.gate",
+ "model.layers.32.self_attn.out_proj",
+ "model.layers.32.self_attn.qkv_proj",
+ "model.layers.32.self_attn.output_gate",
+ "model.layers.32.block_sparse_moe.gate",
+ "model.layers.33.self_attn.out_proj",
+ "model.layers.33.self_attn.qkv_proj",
+ "model.layers.33.self_attn.output_gate",
+ "model.layers.33.block_sparse_moe.gate",
+ "model.layers.34.self_attn.out_proj",
+ "model.layers.34.self_attn.qkv_proj",
+ "model.layers.34.self_attn.output_gate",
+ "model.layers.34.block_sparse_moe.gate",
+ "model.layers.35.self_attn.out_proj",
+ "model.layers.35.self_attn.qkv_proj",
+ "model.layers.35.self_attn.output_gate",
+ "model.layers.35.block_sparse_moe.gate",
+ "model.layers.36.self_attn.out_proj",
+ "model.layers.36.self_attn.qkv_proj",
+ "model.layers.36.self_attn.output_gate",
+ "model.layers.36.block_sparse_moe.gate",
+ "model.layers.37.self_attn.out_proj",
+ "model.layers.37.self_attn.qkv_proj",
+ "model.layers.37.self_attn.output_gate",
+ "model.layers.37.block_sparse_moe.gate",
+ "model.layers.38.self_attn.out_proj",
+ "model.layers.38.self_attn.qkv_proj",
+ "model.layers.38.self_attn.output_gate",
+ "model.layers.38.block_sparse_moe.gate",
+ "model.layers.39.self_attn.q_proj",
+ "model.layers.39.self_attn.k_proj",
+ "model.layers.39.self_attn.v_proj",
+ "model.layers.39.self_attn.o_proj",
+ "model.layers.39.block_sparse_moe.gate",
+ "model.layers.40.self_attn.out_proj",
+ "model.layers.40.self_attn.qkv_proj",
+ "model.layers.40.self_attn.output_gate",
+ "model.layers.40.block_sparse_moe.gate",
+ "model.layers.41.self_attn.out_proj",
+ "model.layers.41.self_attn.qkv_proj",
+ "model.layers.41.self_attn.output_gate",
+ "model.layers.41.block_sparse_moe.gate",
+ "model.layers.42.self_attn.out_proj",
+ "model.layers.42.self_attn.qkv_proj",
+ "model.layers.42.self_attn.output_gate",
+ "model.layers.42.block_sparse_moe.gate",
+ "model.layers.43.self_attn.out_proj",
+ "model.layers.43.self_attn.qkv_proj",
+ "model.layers.43.self_attn.output_gate",
+ "model.layers.43.block_sparse_moe.gate",
+ "model.layers.44.self_attn.out_proj",
+ "model.layers.44.self_attn.qkv_proj",
+ "model.layers.44.self_attn.output_gate",
+ "model.layers.44.block_sparse_moe.gate",
+ "model.layers.45.self_attn.out_proj",
+ "model.layers.45.self_attn.qkv_proj",
+ "model.layers.45.self_attn.output_gate",
+ "model.layers.45.block_sparse_moe.gate",
+ "model.layers.46.self_attn.out_proj",
+ "model.layers.46.self_attn.qkv_proj",
+ "model.layers.46.self_attn.output_gate",
+ "model.layers.46.block_sparse_moe.gate",
+ "model.layers.47.self_attn.q_proj",
+ "model.layers.47.self_attn.k_proj",
+ "model.layers.47.self_attn.v_proj",
+ "model.layers.47.self_attn.o_proj",
+ "model.layers.47.block_sparse_moe.gate",
+ "model.layers.48.self_attn.out_proj",
+ "model.layers.48.self_attn.qkv_proj",
+ "model.layers.48.self_attn.output_gate",
+ "model.layers.48.block_sparse_moe.gate",
+ "model.layers.49.self_attn.out_proj",
+ "model.layers.49.self_attn.qkv_proj",
+ "model.layers.49.self_attn.output_gate",
+ "model.layers.49.block_sparse_moe.gate",
+ "model.layers.50.self_attn.out_proj",
+ "model.layers.50.self_attn.qkv_proj",
+ "model.layers.50.self_attn.output_gate",
+ "model.layers.50.block_sparse_moe.gate",
+ "model.layers.51.self_attn.out_proj",
+ "model.layers.51.self_attn.qkv_proj",
+ "model.layers.51.self_attn.output_gate",
+ "model.layers.51.block_sparse_moe.gate",
+ "model.layers.52.self_attn.out_proj",
+ "model.layers.52.self_attn.qkv_proj",
+ "model.layers.52.self_attn.output_gate",
+ "model.layers.52.block_sparse_moe.gate",
+ "model.layers.53.self_attn.out_proj",
+ "model.layers.53.self_attn.qkv_proj",
+ "model.layers.53.self_attn.output_gate",
+ "model.layers.53.block_sparse_moe.gate",
+ "model.layers.54.self_attn.out_proj",
+ "model.layers.54.self_attn.qkv_proj",
+ "model.layers.54.self_attn.output_gate",
+ "model.layers.54.block_sparse_moe.gate",
+ "model.layers.55.self_attn.q_proj",
+ "model.layers.55.self_attn.k_proj",
+ "model.layers.55.self_attn.v_proj",
+ "model.layers.55.self_attn.o_proj",
+ "model.layers.55.block_sparse_moe.gate",
+ "model.layers.56.self_attn.out_proj",
+ "model.layers.56.self_attn.qkv_proj",
+ "model.layers.56.self_attn.output_gate",
+ "model.layers.56.block_sparse_moe.gate",
+ "model.layers.57.self_attn.out_proj",
+ "model.layers.57.self_attn.qkv_proj",
+ "model.layers.57.self_attn.output_gate",
+ "model.layers.57.block_sparse_moe.gate",
+ "model.layers.58.self_attn.out_proj",
+ "model.layers.58.self_attn.qkv_proj",
+ "model.layers.58.self_attn.output_gate",
+ "model.layers.58.block_sparse_moe.gate",
+ "model.layers.59.self_attn.out_proj",
+ "model.layers.59.self_attn.qkv_proj",
+ "model.layers.59.self_attn.output_gate",
+ "model.layers.59.block_sparse_moe.gate",
+ "model.layers.60.self_attn.out_proj",
+ "model.layers.60.self_attn.qkv_proj",
+ "model.layers.60.self_attn.output_gate",
+ "model.layers.60.block_sparse_moe.gate",
+ "model.layers.61.self_attn.out_proj",
+ "model.layers.61.self_attn.qkv_proj",
+ "model.layers.61.self_attn.output_gate",
+ "model.layers.61.block_sparse_moe.gate",
+ "model.layers.62.self_attn.out_proj",
+ "model.layers.62.self_attn.qkv_proj",
+ "model.layers.62.self_attn.output_gate",
+ "model.layers.62.block_sparse_moe.gate",
+ "model.layers.63.self_attn.q_proj",
+ "model.layers.63.self_attn.k_proj",
+ "model.layers.63.self_attn.v_proj",
+ "model.layers.63.self_attn.o_proj",
+ "model.layers.63.block_sparse_moe.gate",
+ "model.layers.64.self_attn.out_proj",
+ "model.layers.64.self_attn.qkv_proj",
+ "model.layers.64.self_attn.output_gate",
+ "model.layers.64.block_sparse_moe.gate",
+ "model.layers.65.self_attn.out_proj",
+ "model.layers.65.self_attn.qkv_proj",
+ "model.layers.65.self_attn.output_gate",
+ "model.layers.65.block_sparse_moe.gate",
+ "model.layers.66.self_attn.out_proj",
+ "model.layers.66.self_attn.qkv_proj",
+ "model.layers.66.self_attn.output_gate",
+ "model.layers.66.block_sparse_moe.gate",
+ "model.layers.67.self_attn.out_proj",
+ "model.layers.67.self_attn.qkv_proj",
+ "model.layers.67.self_attn.output_gate",
+ "model.layers.67.block_sparse_moe.gate",
+ "model.layers.68.self_attn.out_proj",
+ "model.layers.68.self_attn.qkv_proj",
+ "model.layers.68.self_attn.output_gate",
+ "model.layers.68.block_sparse_moe.gate",
+ "model.layers.69.self_attn.out_proj",
+ "model.layers.69.self_attn.qkv_proj",
+ "model.layers.69.self_attn.output_gate",
+ "model.layers.69.block_sparse_moe.gate",
+ "model.layers.70.self_attn.out_proj",
+ "model.layers.70.self_attn.qkv_proj",
+ "model.layers.70.self_attn.output_gate",
+ "model.layers.70.block_sparse_moe.gate",
+ "model.layers.71.self_attn.q_proj",
+ "model.layers.71.self_attn.k_proj",
+ "model.layers.71.self_attn.v_proj",
+ "model.layers.71.self_attn.o_proj",
+ "model.layers.71.block_sparse_moe.gate",
+ "model.layers.72.self_attn.out_proj",
+ "model.layers.72.self_attn.qkv_proj",
+ "model.layers.72.self_attn.output_gate",
+ "model.layers.72.block_sparse_moe.gate",
+ "model.layers.73.self_attn.out_proj",
+ "model.layers.73.self_attn.qkv_proj",
+ "model.layers.73.self_attn.output_gate",
+ "model.layers.73.block_sparse_moe.gate",
+ "model.layers.74.self_attn.out_proj",
+ "model.layers.74.self_attn.qkv_proj",
+ "model.layers.74.self_attn.output_gate",
+ "model.layers.74.block_sparse_moe.gate",
+ "model.layers.75.self_attn.out_proj",
+ "model.layers.75.self_attn.qkv_proj",
+ "model.layers.75.self_attn.output_gate",
+ "model.layers.75.block_sparse_moe.gate",
+ "model.layers.76.self_attn.out_proj",
+ "model.layers.76.self_attn.qkv_proj",
+ "model.layers.76.self_attn.output_gate",
+ "model.layers.76.block_sparse_moe.gate",
+ "model.layers.77.self_attn.out_proj",
+ "model.layers.77.self_attn.qkv_proj",
+ "model.layers.77.self_attn.output_gate",
+ "model.layers.77.block_sparse_moe.gate",
+ "model.layers.78.self_attn.out_proj",
+ "model.layers.78.self_attn.qkv_proj",
+ "model.layers.78.self_attn.output_gate",
+ "model.layers.78.block_sparse_moe.gate",
+ "model.layers.79.self_attn.q_proj",
+ "model.layers.79.self_attn.k_proj",
+ "model.layers.79.self_attn.v_proj",
+ "model.layers.79.self_attn.o_proj",
+ "model.layers.79.block_sparse_moe.gate",
+ "lm_head"
+ ],
+ "kv_cache_scheme": null,
+ "quant_method": "compressed-tensors",
+ "quantization_status": "compressed"
+ },
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 10000000,
+ "rotary_dim": 64,
+ "router_aux_loss_coef": 0.001,
+ "router_jitter_noise": 0.0,
+ "shared_intermediate_size": 0,
+ "shared_moe_mode": "sigmoid",
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.52.4",
+ "use_cache": true,
+ "vocab_size": 200064
+ }
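Decoded, the quantization_config above says: symmetric INT4 weight-only quantization with one scale per group of 128 weights, activations left untouched (hence W4A16), applied to every Linear module except the entries in ignore — the attention projections, the MoE router gates, and lm_head stay in bfloat16 — and stored in compressed-tensors "pack-quantized" format. A short sketch that reads those fields back out of the file:

```python
# Inspect the quantization scheme directly from config.json.
import json

with open("config.json") as f:
    cfg = json.load(f)

q = cfg["quantization_config"]
w = q["config_groups"]["group_0"]["weights"]
print(q["quant_method"], q["format"])  # compressed-tensors pack-quantized
print(f"{w['num_bits']}-bit {w['type']}, strategy={w['strategy']}, "
      f"group_size={w['group_size']}, symmetric={w['symmetric']}")
print(len(q["ignore"]), "modules left unquantized")  # attn projections, router gates, lm_head
```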
configuration_minimax_m1.py ADDED
@@ -0,0 +1,152 @@
+ """ MiniMaxM1 model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class MiniMaxM1Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`MiniMaxM1Model`]. It is used to instantiate a
+     MiniMaxM1 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a similar configuration to that of the MiniMaxM1.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the MiniMaxM1 model. Defines the number of different tokens that can be represented by
+             the `inputs_ids` passed when calling [`MiniMaxM1Model`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 14336):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_key_value_heads (`int`, *optional*, defaults to 8):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details check out [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
+             The maximum sequence length that this model might ever be used with. MiniMaxM1's sliding window attention
+             allows sequences of up to 4096*32 tokens.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         pad_token_id (`int`, *optional*):
+             The id of the padding token.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             The id of the "beginning-of-sequence" token.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             The id of the "end-of-sequence" token.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether the model's input and output word embeddings should be tied.
+         rope_theta (`float`, *optional*, defaults to 1000000.0):
+             The base period of the RoPE embeddings.
+         sliding_window (`int`, *optional*):
+             Sliding window attention window size. If not specified, will default to `4096`.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         num_experts_per_tok (`int`, *optional*, defaults to 2):
+             The number of experts to route per token; can also be interpreted as the `top-k` routing
+             parameter.
+         num_local_experts (`int`, *optional*, defaults to 8):
+             Number of experts per Sparse MLP layer.
+         output_router_logits (`bool`, *optional*, defaults to `False`):
+             Whether or not the router logits should be returned by the model. Enabling this will also
+             allow the model to output the auxiliary loss. See [here]() for more details
+         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+             The aux loss factor for the total loss.
+         router_jitter_noise (`float`, *optional*, defaults to 0.0):
+             Amount of noise to add to the router.
+
+     ```python
+     >>> from transformers import MiniMaxM1Model, MiniMaxM1Config
+
+     >>> # Initializing a MiniMaxM1 style configuration
+     >>> configuration = MiniMaxM1Config()
+
+     >>> # Initializing a model from the MiniMaxM1 style configuration
+     >>> model = MiniMaxM1Model(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "MiniMaxM1"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         hidden_size=4096,
+         intermediate_size=14336,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=8,
+         hidden_act="silu",
+         max_position_embeddings=4096 * 32,
+         initializer_range=0.02,
+         rms_norm_eps=1e-5,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=None,
+         eos_token_id=None,
+         tie_word_embeddings=False,
+         rope_theta=1e6,
+         sliding_window=None,
+         attention_dropout=0.0,
+         num_experts_per_tok=2,
+         num_local_experts=8,
+         output_router_logits=False,
+         router_aux_loss_coef=0.001,
+         router_jitter_noise=0.0,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.sliding_window = sliding_window
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.attention_dropout = attention_dropout
+
+         self.num_experts_per_tok = num_experts_per_tok
+         self.num_local_experts = num_local_experts
+         self.output_router_logits = output_router_logits
+         self.router_aux_loss_coef = router_aux_loss_coef
+         self.router_jitter_noise = router_jitter_noise
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
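Because the auto_map in config.json points AutoConfig at this module, the configuration loads generically without importing MiniMaxM1Config by hand. A sketch with a placeholder local path:

```python
from transformers import AutoConfig

# trust_remote_code is required since MiniMaxM1 is not a built-in architecture.
config = AutoConfig.from_pretrained("./MiniMax-M1-INT4-W4A16", trust_remote_code=True)
print(config.model_type, config.num_hidden_layers, config.vocab_size)
# MiniMaxM1 80 200064
```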
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "_from_model_config": true,
+ "transformers_version": "4.52.4"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6204b0320fe59eec5b6fd1cfb61ef8f521a74039445f7a77d9777edaf57ca46
+ size 4976684776
model-00002-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b43d307508d0dad5f5559b91f5d3b38d4019f038440e49bdbf3b1db0ae3209ec
+ size 4978252672
model-00003-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d385b4fe4b83b11226c36fe85110eddc5c35032e2ea17e3d122ea918f1e2f2d6
+ size 5000041624
model-00004-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94917e1b2eba7ed35c1159285a43a4bb35966328ac87cc1420ddd0f4bdcd4ea0
+ size 4978252664
model-00005-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37da1e4c4db8b6c3e8cfcbf4bed1791b6698df15d78004b96131bd36a0df3fbd
+ size 5000041632
model-00006-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d1c06dedb64a24c85e135c8279d3bcab1e20a6713eeff89c3d8d51276d8267c
+ size 4993379088
model-00007-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0684c79ae07e7140b526464657e3e3ebcee8e263749c5ffecb7bd0cd66df136f
+ size 5000041664
model-00008-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7e393b8de5475c218afc7bfe1ad687692129a0e592a1d6322b08337fa4eb87e
+ size 4978253008
model-00009-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9f52c14d62245cf17e1e6e2018864f91b6f9b5762f61b5ad799304409e8cdf4
+ size 5000042144
model-00010-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f7aa27b214cecd8eefe675a56a2065849232faec54898ae4b2536c4fea14327
+ size 4978253040
model-00011-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eff19e899974d0b33bdf329b1229555084c8d135957739086a5c69868be811f3
+ size 4927603232
model-00012-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7dec8d6cb5ee1dd99737bef7cf2d183e91c364fb111eeff9073aaba0790bdf20
+ size 4978228368
model-00013-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1be20aff0461f959b8324f7cb35c80c72d4238922cc7151a589d2dce933defd0
+ size 4941673432
model-00014-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef0832b98e8ccfd70bbac12d2f5f44a481af8d6279a54e0ebb8979ef928820c8
+ size 4978228368
model-00015-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b1e999865cd32638fb5e773a4e2019f303187ae06fba32e2b660d8cdf4864dd
+ size 4941673432
model-00016-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d7b9e20f911da59215688f1727151d4d73e6d64303fd3d145cd28f0af3e82f4
+ size 4993354856
model-00017-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21dc9070d687e0cff8f34d100848213eef8e49ec366ffe335f568b851ff0e0d6
+ size 4750386400
model-00018-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e77ccabd5c15319224bc7318d89d719751ec8839891e7db3fb247bf195dbab8e
+ size 4994335232
model-00019-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4853ffa3871c04b86d24d5b7755e02ca501e20836b7e2042539c8f4aa63a8b6
+ size 4925566568
model-00020-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:411fd9b8c00a277ce2f3f4b038a0ffa1d79d8c88062c0625cc50bcebdd9caf24
+ size 4994335232
model-00021-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2075c8837306dc98661824c7fc4338729f62eae0611acb718c3459f80d201c0
+ size 4950716128
model-00022-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:439c726c259b4a55c2585e2957ffc91110e1d50f686305802d609a404d059daa
+ size 4984312160
model-00023-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:adfe1bf49e812a23ad5512e3f90edf80b0f173ed93ea7c53b1844bce0c0b755d
+ size 4935589632
model-00024-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ae749f9f6db6da7523f0434172e55aa4cb574d5d4d53aef67db2322c88b84b4
+ size 4984312168
model-00025-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f95ae73c02f70f41d77551a5f0e698bfaed8b354a484f88804d1ed91fdabf77e
+ size 4935589632
model-00026-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aabe0e9d99453c3fe54b45da0327fe8d0d65bfac3912b7e05e3b1f9050727c9f
+ size 4984312168
model-00027-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c34182f82678d954ea128d63ed703645b3e758ded2484b148758e472348d716
+ size 4993379600
model-00028-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e01e79df76689d26508c3199a412afa5fd77b9608273e43213618a65098e1cb
+ size 5000042080
model-00029-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5000ec848fa1903220c8b4e08d10de6ea592c9d64fdef8b14c3b617d64885393
+ size 4978253104
model-00030-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:825c337593471bd22752f7373bfa61c8038867ba43b67589ddb7a6018f9cfae7
+ size 5000042088
model-00031-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e0c294928eae7aa4ddf901c922cc5f5ab5932ca94e4d1cb7b9ee775918da617
+ size 4978253096
model-00032-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e37371c304394aac49c96c8246b528d9c74f703511fd02e0ef09931ca0c9e49
+ size 4985971888
model-00033-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ca5c4283ed6ff2eada41692e414ff6c96c4dee177f1a539c906f240429180fe
+ size 4978253064
model-00034-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d7170567c428cb1476fb68d5f3af02ad719663ca647376b78173b8f4584ea814
+ size 5000042128
model-00035-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a094ac609531b7821e778509f34f0287c8c6eb5b9821ec772006320f67262746
+ size 4978253056
model-00036-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe18d3d032892d6e0a4c3d23c88f758c8e6b71bcb5de5d11e081421326e80a26
+ size 5000042128
model-00037-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b21934a416599ed2cc252f1b2155fc7371ccdfc331ccc5c6c94a550b224db101
+ size 4993379536
model-00038-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d42cf2eb086e0235ba81c82d5be68bcae07c1d1b2f9ab06ff8fe3da2e9b240f5
+ size 5000042144
model-00039-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ee0fa9c72e2da22bbf15e4124f619da4490f697d0acf947868158ed8e503b35
+ size 4978253040
model-00040-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bdcd6dbe350d91526d73628c332a7648290b393928d969e927b33d6afcffefb4
+ size 4970870128
model-00041-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d922d089da27024b9d5d63b7dff99549a2b19bd4a4b6497f646c2c375b4f45c
+ size 4978228368
model-00042-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d60910c3eb7c8b2c235e05465a5bef8a6a2fa68caa8e6abc7da74032cb7e27f
+ size 4941673432
model-00043-of-00054.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4dfb3908bb4ac04da88b1333548ab20461e9e656a7d10242f584bd5a1aea9cb
+ size 4993354856
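Each safetensors entry above is a Git LFS pointer — a three-line stub recording the spec version, the blob's sha256, and its size in bytes — not the tensor data itself. A sketch that sums the size fields from pointer files in a smudge-skipped clone (GIT_LFS_SKIP_SMUDGE=1 git clone ...) to estimate the checkpoint's footprint; note this truncated view lists only 43 of the 54 shards, so the on-page total is a lower bound:

```python
# Sum LFS pointer sizes to estimate total shard payload (assumes the files are
# still pointer stubs, i.e. the repo was cloned with GIT_LFS_SKIP_SMUDGE=1).
import glob

paths = glob.glob("model-*-of-00054.safetensors")
total = 0
for path in paths:
    with open(path) as f:
        fields = dict(line.split(" ", 1) for line in f.read().splitlines())
    total += int(fields["size"])
print(f"{total / 1e9:.1f} GB across {len(paths)} shards")
```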