Safetensors
qwen3
ehartford commited on
Commit
88841e4
·
verified ·
1 Parent(s): 09f6bf9

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 8192,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 29568,
14
+ "max_position_embeddings": 40960,
15
+ "max_window_layers": 64,
16
+ "model_type": "qwen3",
17
+ "num_attention_heads": 64,
18
+ "num_hidden_layers": 80,
19
+ "num_key_value_heads": 8,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": null,
22
+ "rope_theta": 1000000,
23
+ "sliding_window": null,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "bfloat16",
26
+ "transformers_version": "4.51.3",
27
+ "use_cache": true,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936
30
+ }
model-00001-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61d8531d1f81d1383acf3bbbac9cc86799c1ab723f4ca6d6e3b0f9a9b9cbbd25
3
+ size 4978672024
model-00002-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d6b57a4d4e5f727556f8f0dd5c5dcbf45746d450265b09a909ea1fd4d9b1180
3
+ size 4980822416
model-00003-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e449ae2d61491564ab3f88a2a07a2680766d322ae233dc9b8bcbe908c9056f60
3
+ size 4582381232
model-00004-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48fbe2ced8e5d7f3f84b8be1f2a64986151c637c48400617db7ae615adb4975b
3
+ size 4964044712
model-00005-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8592a3b57d0a67b8d8fce74f6b84cecc5096f250104fe5dba621cbd149389d03
3
+ size 4781593584
model-00006-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4de35e7d8a69581677277a1fe318635d03ebe1ec92fac250f81ccf45c44832a6
3
+ size 4980838936
model-00007-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d63efa2cefba168ce668100cbe8c3b0569d339d794f6c59df0032f79d8656df
3
+ size 4582381264
model-00008-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:204c31141374e12802b99f554238889608219c16dca1cecbd946c32ce2ff6f9f
3
+ size 4964028240
model-00009-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93794680f27d50d028a1c99cff2eae192aff3e61466022e254e7042caddbdc2d
3
+ size 4781626592
model-00010-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fdf85fbbcbec6581335017ecb8c4c1d4b047b71d6c2c385f88281f980a78f04
3
+ size 4980822448
model-00011-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cd0a5d9651fc545b144793285ab67899a4e0143f626053e83dd6df7cdaf4ad8
3
+ size 4764815896
model-00012-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0ed72485fefc060baf3e4aec6e4a201ae6f52409740dd71baba212d6bb03c73
3
+ size 4781610096
model-00013-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:045224aa955cc8ba6785401226d2e816d3d7cbb26647be51cc4ec7bcc702bea7
3
+ size 4781610104
model-00014-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d68bfa4d4d99c27033d9b028f83cad1ea32c80ee6ea21a33569b3a0aa8c3695e
3
+ size 4980822448
model-00015-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad7858bab59b3e4f76bc5cf046656de51899e9a9416dda5cf2bd0c4a0ab52fbc
3
+ size 4764815896
model-00016-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:350e0b89e9a0f93f9e1f7ba1e6f830c41c38ffdf813bb10b15f9ecdf810b8258
3
+ size 4781610096
model-00017-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:503fa67d7d5bb56599f46fb7ad4a5e7e3d8f0a79d9d02a51e608e50fe71443d8
3
+ size 4781610104
model-00018-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4945e879dc16e52f53cd72850c88bad8f0eb7627ca7e0cab6238bd674a1e0a7e
3
+ size 4980822448
model-00019-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a276ac969178765a11b24009bb26a4d794c46dc0366957e76ca66a5e0d026bd5
3
+ size 4764815896
model-00020-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b841082f9bfcf23a1beed49631e60d90e3dabcd2ffafbcf2653f0eddc9d313
3
+ size 4932606008
model-00021-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b780c1c411b0b153da5a06313b17145373ccbd129548cc37c974e5582f51c4a
3
+ size 4630614192
model-00022-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cce4c30cc5efad79a98e7a0eee32fd6b9425d3181ab1fc986dea546207db4785
3
+ size 4781593616
model-00023-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99c19833f5be9d4a3640dbb4da8b4145f465b1ac3e5417e3fd0828b13ce59c2b
3
+ size 4964044736
model-00024-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e571dacedd834da1cc739297b61a7adaa2a9e8edc26b6d76c3c5d6aa52017d9
3
+ size 4781610096
model-00025-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cb4457897c90b6c30fb1687107f13c8b1b9b6bccafb84478f00f9a4891378ad
3
+ size 4781610104
model-00026-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc6efe9bd5d36485f6e6ed48cec49739209d644571d862979e1a07707cdfd3e6
3
+ size 4980822448
model-00027-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d0d0da582857f590f8b4c34cb176a6ddc6cf80fc55125da7ff249fa8234ccd7
3
+ size 4764815896
model-00028-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4cf90c45bb46f9c2c29adadf00acdb41ff760309296025e12930f885c882dc6
3
+ size 4781610096
model-00029-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b96092f2efa70a7ece0b01f4d2286fc6f3583a311244d2f9b823003db62384d
3
+ size 4781610104
model-00030-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:352997a6410ad74fcab769182d1a8de0c248d2d79fc3be3dd366b46de4748528
3
+ size 4781593616
model-00031-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e52be5d002bb99434fbb60481509eafcfe548966f2f73b6fc344713c955b82b6
3
+ size 484442232
model.safetensors.index.json ADDED
@@ -0,0 +1,890 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 145406615552
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00001-of-00031.safetensors",
7
+ "model.norm.weight": "model-00001-of-00031.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00031.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00031.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00002-of-00031.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00031.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00002-of-00031.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00031.safetensors",
14
+ "model.layers.0.self_attn.k_norm.weight": "model-00002-of-00031.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00031.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00031.safetensors",
17
+ "model.layers.0.self_attn.q_norm.weight": "model-00002-of-00031.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00031.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00031.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00002-of-00031.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00002-of-00031.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00031.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00002-of-00031.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00031.safetensors",
25
+ "model.layers.1.self_attn.k_norm.weight": "model-00002-of-00031.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00031.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00031.safetensors",
28
+ "model.layers.1.self_attn.q_norm.weight": "model-00002-of-00031.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00031.safetensors",
30
+ "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00031.safetensors",
31
+ "model.layers.2.mlp.up_proj.weight": "model-00002-of-00031.safetensors",
32
+ "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00031.safetensors",
33
+ "model.layers.2.input_layernorm.weight": "model-00002-of-00031.safetensors",
34
+ "model.layers.2.mlp.down_proj.weight": "model-00002-of-00031.safetensors",
35
+ "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00031.safetensors",
36
+ "model.layers.2.self_attn.k_norm.weight": "model-00002-of-00031.safetensors",
37
+ "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00031.safetensors",
38
+ "model.layers.2.self_attn.o_proj.weight": "model-00003-of-00031.safetensors",
39
+ "model.layers.2.self_attn.q_norm.weight": "model-00003-of-00031.safetensors",
40
+ "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00031.safetensors",
41
+ "model.layers.2.self_attn.v_proj.weight": "model-00003-of-00031.safetensors",
42
+ "model.layers.3.input_layernorm.weight": "model-00003-of-00031.safetensors",
43
+ "model.layers.3.mlp.down_proj.weight": "model-00003-of-00031.safetensors",
44
+ "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00031.safetensors",
45
+ "model.layers.3.mlp.up_proj.weight": "model-00003-of-00031.safetensors",
46
+ "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00031.safetensors",
47
+ "model.layers.3.self_attn.k_norm.weight": "model-00003-of-00031.safetensors",
48
+ "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00031.safetensors",
49
+ "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00031.safetensors",
50
+ "model.layers.3.self_attn.q_norm.weight": "model-00003-of-00031.safetensors",
51
+ "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00031.safetensors",
52
+ "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00031.safetensors",
53
+ "model.layers.4.input_layernorm.weight": "model-00003-of-00031.safetensors",
54
+ "model.layers.4.mlp.down_proj.weight": "model-00003-of-00031.safetensors",
55
+ "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00031.safetensors",
56
+ "model.layers.4.mlp.up_proj.weight": "model-00003-of-00031.safetensors",
57
+ "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00031.safetensors",
58
+ "model.layers.4.self_attn.k_norm.weight": "model-00003-of-00031.safetensors",
59
+ "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00031.safetensors",
60
+ "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00031.safetensors",
61
+ "model.layers.4.self_attn.q_norm.weight": "model-00003-of-00031.safetensors",
62
+ "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00031.safetensors",
63
+ "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00031.safetensors",
64
+ "model.layers.5.mlp.up_proj.weight": "model-00003-of-00031.safetensors",
65
+ "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00031.safetensors",
66
+ "model.layers.5.self_attn.k_norm.weight": "model-00003-of-00031.safetensors",
67
+ "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00031.safetensors",
68
+ "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00031.safetensors",
69
+ "model.layers.5.self_attn.q_norm.weight": "model-00003-of-00031.safetensors",
70
+ "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00031.safetensors",
71
+ "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00031.safetensors",
72
+ "model.layers.5.input_layernorm.weight": "model-00003-of-00031.safetensors",
73
+ "model.layers.5.mlp.down_proj.weight": "model-00004-of-00031.safetensors",
74
+ "model.layers.5.mlp.gate_proj.weight": "model-00004-of-00031.safetensors",
75
+ "model.layers.6.input_layernorm.weight": "model-00004-of-00031.safetensors",
76
+ "model.layers.6.mlp.down_proj.weight": "model-00004-of-00031.safetensors",
77
+ "model.layers.6.mlp.gate_proj.weight": "model-00004-of-00031.safetensors",
78
+ "model.layers.6.mlp.up_proj.weight": "model-00004-of-00031.safetensors",
79
+ "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00031.safetensors",
80
+ "model.layers.6.self_attn.k_norm.weight": "model-00004-of-00031.safetensors",
81
+ "model.layers.6.self_attn.k_proj.weight": "model-00004-of-00031.safetensors",
82
+ "model.layers.6.self_attn.o_proj.weight": "model-00004-of-00031.safetensors",
83
+ "model.layers.6.self_attn.q_norm.weight": "model-00004-of-00031.safetensors",
84
+ "model.layers.6.self_attn.q_proj.weight": "model-00004-of-00031.safetensors",
85
+ "model.layers.6.self_attn.v_proj.weight": "model-00004-of-00031.safetensors",
86
+ "model.layers.7.input_layernorm.weight": "model-00004-of-00031.safetensors",
87
+ "model.layers.7.mlp.down_proj.weight": "model-00004-of-00031.safetensors",
88
+ "model.layers.7.mlp.gate_proj.weight": "model-00004-of-00031.safetensors",
89
+ "model.layers.7.mlp.up_proj.weight": "model-00004-of-00031.safetensors",
90
+ "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00031.safetensors",
91
+ "model.layers.7.self_attn.k_norm.weight": "model-00004-of-00031.safetensors",
92
+ "model.layers.7.self_attn.k_proj.weight": "model-00004-of-00031.safetensors",
93
+ "model.layers.7.self_attn.o_proj.weight": "model-00004-of-00031.safetensors",
94
+ "model.layers.7.self_attn.q_norm.weight": "model-00004-of-00031.safetensors",
95
+ "model.layers.7.self_attn.q_proj.weight": "model-00004-of-00031.safetensors",
96
+ "model.layers.7.self_attn.v_proj.weight": "model-00004-of-00031.safetensors",
97
+ "model.layers.8.input_layernorm.weight": "model-00004-of-00031.safetensors",
98
+ "model.layers.8.mlp.down_proj.weight": "model-00004-of-00031.safetensors",
99
+ "model.layers.8.mlp.gate_proj.weight": "model-00005-of-00031.safetensors",
100
+ "model.layers.8.mlp.up_proj.weight": "model-00005-of-00031.safetensors",
101
+ "model.layers.8.post_attention_layernorm.weight": "model-00005-of-00031.safetensors",
102
+ "model.layers.8.self_attn.k_norm.weight": "model-00005-of-00031.safetensors",
103
+ "model.layers.8.self_attn.k_proj.weight": "model-00005-of-00031.safetensors",
104
+ "model.layers.8.self_attn.o_proj.weight": "model-00005-of-00031.safetensors",
105
+ "model.layers.8.self_attn.q_norm.weight": "model-00005-of-00031.safetensors",
106
+ "model.layers.8.self_attn.q_proj.weight": "model-00005-of-00031.safetensors",
107
+ "model.layers.8.self_attn.v_proj.weight": "model-00005-of-00031.safetensors",
108
+ "model.layers.9.input_layernorm.weight": "model-00005-of-00031.safetensors",
109
+ "model.layers.9.mlp.down_proj.weight": "model-00005-of-00031.safetensors",
110
+ "model.layers.9.mlp.gate_proj.weight": "model-00005-of-00031.safetensors",
111
+ "model.layers.9.mlp.up_proj.weight": "model-00005-of-00031.safetensors",
112
+ "model.layers.9.post_attention_layernorm.weight": "model-00005-of-00031.safetensors",
113
+ "model.layers.9.self_attn.k_norm.weight": "model-00005-of-00031.safetensors",
114
+ "model.layers.9.self_attn.k_proj.weight": "model-00005-of-00031.safetensors",
115
+ "model.layers.9.self_attn.o_proj.weight": "model-00005-of-00031.safetensors",
116
+ "model.layers.9.self_attn.q_norm.weight": "model-00005-of-00031.safetensors",
117
+ "model.layers.9.self_attn.q_proj.weight": "model-00005-of-00031.safetensors",
118
+ "model.layers.9.self_attn.v_proj.weight": "model-00005-of-00031.safetensors",
119
+ "model.layers.10.input_layernorm.weight": "model-00005-of-00031.safetensors",
120
+ "model.layers.10.mlp.down_proj.weight": "model-00005-of-00031.safetensors",
121
+ "model.layers.10.mlp.gate_proj.weight": "model-00005-of-00031.safetensors",
122
+ "model.layers.10.mlp.up_proj.weight": "model-00005-of-00031.safetensors",
123
+ "model.layers.10.post_attention_layernorm.weight": "model-00005-of-00031.safetensors",
124
+ "model.layers.10.self_attn.k_norm.weight": "model-00005-of-00031.safetensors",
125
+ "model.layers.10.self_attn.k_proj.weight": "model-00005-of-00031.safetensors",
126
+ "model.layers.10.self_attn.o_proj.weight": "model-00005-of-00031.safetensors",
127
+ "model.layers.10.self_attn.q_norm.weight": "model-00005-of-00031.safetensors",
128
+ "model.layers.10.self_attn.q_proj.weight": "model-00005-of-00031.safetensors",
129
+ "model.layers.10.self_attn.v_proj.weight": "model-00005-of-00031.safetensors",
130
+ "model.layers.11.mlp.down_proj.weight": "model-00006-of-00031.safetensors",
131
+ "model.layers.11.mlp.gate_proj.weight": "model-00006-of-00031.safetensors",
132
+ "model.layers.11.mlp.up_proj.weight": "model-00006-of-00031.safetensors",
133
+ "model.layers.11.post_attention_layernorm.weight": "model-00006-of-00031.safetensors",
134
+ "model.layers.11.self_attn.k_norm.weight": "model-00006-of-00031.safetensors",
135
+ "model.layers.11.self_attn.k_proj.weight": "model-00006-of-00031.safetensors",
136
+ "model.layers.11.self_attn.o_proj.weight": "model-00006-of-00031.safetensors",
137
+ "model.layers.11.self_attn.q_norm.weight": "model-00006-of-00031.safetensors",
138
+ "model.layers.11.self_attn.q_proj.weight": "model-00006-of-00031.safetensors",
139
+ "model.layers.11.self_attn.v_proj.weight": "model-00006-of-00031.safetensors",
140
+ "model.layers.11.input_layernorm.weight": "model-00006-of-00031.safetensors",
141
+ "model.layers.12.input_layernorm.weight": "model-00006-of-00031.safetensors",
142
+ "model.layers.12.mlp.down_proj.weight": "model-00006-of-00031.safetensors",
143
+ "model.layers.12.mlp.gate_proj.weight": "model-00006-of-00031.safetensors",
144
+ "model.layers.12.mlp.up_proj.weight": "model-00006-of-00031.safetensors",
145
+ "model.layers.12.post_attention_layernorm.weight": "model-00006-of-00031.safetensors",
146
+ "model.layers.12.self_attn.k_norm.weight": "model-00006-of-00031.safetensors",
147
+ "model.layers.12.self_attn.k_proj.weight": "model-00006-of-00031.safetensors",
148
+ "model.layers.12.self_attn.o_proj.weight": "model-00006-of-00031.safetensors",
149
+ "model.layers.12.self_attn.q_norm.weight": "model-00006-of-00031.safetensors",
150
+ "model.layers.12.self_attn.q_proj.weight": "model-00006-of-00031.safetensors",
151
+ "model.layers.12.self_attn.v_proj.weight": "model-00006-of-00031.safetensors",
152
+ "model.layers.13.input_layernorm.weight": "model-00006-of-00031.safetensors",
153
+ "model.layers.13.mlp.down_proj.weight": "model-00006-of-00031.safetensors",
154
+ "model.layers.13.mlp.gate_proj.weight": "model-00006-of-00031.safetensors",
155
+ "model.layers.13.mlp.up_proj.weight": "model-00006-of-00031.safetensors",
156
+ "model.layers.13.post_attention_layernorm.weight": "model-00006-of-00031.safetensors",
157
+ "model.layers.13.self_attn.k_norm.weight": "model-00006-of-00031.safetensors",
158
+ "model.layers.13.self_attn.k_proj.weight": "model-00006-of-00031.safetensors",
159
+ "model.layers.13.self_attn.o_proj.weight": "model-00007-of-00031.safetensors",
160
+ "model.layers.13.self_attn.q_norm.weight": "model-00007-of-00031.safetensors",
161
+ "model.layers.13.self_attn.q_proj.weight": "model-00007-of-00031.safetensors",
162
+ "model.layers.13.self_attn.v_proj.weight": "model-00007-of-00031.safetensors",
163
+ "model.layers.14.input_layernorm.weight": "model-00007-of-00031.safetensors",
164
+ "model.layers.14.mlp.down_proj.weight": "model-00007-of-00031.safetensors",
165
+ "model.layers.14.mlp.gate_proj.weight": "model-00007-of-00031.safetensors",
166
+ "model.layers.14.mlp.up_proj.weight": "model-00007-of-00031.safetensors",
167
+ "model.layers.14.post_attention_layernorm.weight": "model-00007-of-00031.safetensors",
168
+ "model.layers.14.self_attn.k_norm.weight": "model-00007-of-00031.safetensors",
169
+ "model.layers.14.self_attn.k_proj.weight": "model-00007-of-00031.safetensors",
170
+ "model.layers.14.self_attn.o_proj.weight": "model-00007-of-00031.safetensors",
171
+ "model.layers.14.self_attn.q_norm.weight": "model-00007-of-00031.safetensors",
172
+ "model.layers.14.self_attn.q_proj.weight": "model-00007-of-00031.safetensors",
173
+ "model.layers.14.self_attn.v_proj.weight": "model-00007-of-00031.safetensors",
174
+ "model.layers.15.input_layernorm.weight": "model-00007-of-00031.safetensors",
175
+ "model.layers.15.mlp.down_proj.weight": "model-00007-of-00031.safetensors",
176
+ "model.layers.15.mlp.gate_proj.weight": "model-00007-of-00031.safetensors",
177
+ "model.layers.15.mlp.up_proj.weight": "model-00007-of-00031.safetensors",
178
+ "model.layers.15.post_attention_layernorm.weight": "model-00007-of-00031.safetensors",
179
+ "model.layers.15.self_attn.k_norm.weight": "model-00007-of-00031.safetensors",
180
+ "model.layers.15.self_attn.k_proj.weight": "model-00007-of-00031.safetensors",
181
+ "model.layers.15.self_attn.o_proj.weight": "model-00007-of-00031.safetensors",
182
+ "model.layers.15.self_attn.q_norm.weight": "model-00007-of-00031.safetensors",
183
+ "model.layers.15.self_attn.q_proj.weight": "model-00007-of-00031.safetensors",
184
+ "model.layers.15.self_attn.v_proj.weight": "model-00007-of-00031.safetensors",
185
+ "model.layers.16.mlp.up_proj.weight": "model-00007-of-00031.safetensors",
186
+ "model.layers.16.post_attention_layernorm.weight": "model-00007-of-00031.safetensors",
187
+ "model.layers.16.self_attn.k_norm.weight": "model-00007-of-00031.safetensors",
188
+ "model.layers.16.self_attn.k_proj.weight": "model-00007-of-00031.safetensors",
189
+ "model.layers.16.self_attn.o_proj.weight": "model-00007-of-00031.safetensors",
190
+ "model.layers.16.self_attn.q_norm.weight": "model-00007-of-00031.safetensors",
191
+ "model.layers.16.self_attn.q_proj.weight": "model-00007-of-00031.safetensors",
192
+ "model.layers.16.self_attn.v_proj.weight": "model-00007-of-00031.safetensors",
193
+ "model.layers.16.input_layernorm.weight": "model-00007-of-00031.safetensors",
194
+ "model.layers.16.mlp.down_proj.weight": "model-00008-of-00031.safetensors",
195
+ "model.layers.16.mlp.gate_proj.weight": "model-00008-of-00031.safetensors",
196
+ "model.layers.17.input_layernorm.weight": "model-00008-of-00031.safetensors",
197
+ "model.layers.17.mlp.down_proj.weight": "model-00008-of-00031.safetensors",
198
+ "model.layers.17.mlp.gate_proj.weight": "model-00008-of-00031.safetensors",
199
+ "model.layers.17.mlp.up_proj.weight": "model-00008-of-00031.safetensors",
200
+ "model.layers.17.post_attention_layernorm.weight": "model-00008-of-00031.safetensors",
201
+ "model.layers.17.self_attn.k_norm.weight": "model-00008-of-00031.safetensors",
202
+ "model.layers.17.self_attn.k_proj.weight": "model-00008-of-00031.safetensors",
203
+ "model.layers.17.self_attn.o_proj.weight": "model-00008-of-00031.safetensors",
204
+ "model.layers.17.self_attn.q_norm.weight": "model-00008-of-00031.safetensors",
205
+ "model.layers.17.self_attn.q_proj.weight": "model-00008-of-00031.safetensors",
206
+ "model.layers.17.self_attn.v_proj.weight": "model-00008-of-00031.safetensors",
207
+ "model.layers.18.input_layernorm.weight": "model-00008-of-00031.safetensors",
208
+ "model.layers.18.mlp.down_proj.weight": "model-00008-of-00031.safetensors",
209
+ "model.layers.18.mlp.gate_proj.weight": "model-00008-of-00031.safetensors",
210
+ "model.layers.18.mlp.up_proj.weight": "model-00008-of-00031.safetensors",
211
+ "model.layers.18.post_attention_layernorm.weight": "model-00008-of-00031.safetensors",
212
+ "model.layers.18.self_attn.k_norm.weight": "model-00008-of-00031.safetensors",
213
+ "model.layers.18.self_attn.k_proj.weight": "model-00008-of-00031.safetensors",
214
+ "model.layers.18.self_attn.o_proj.weight": "model-00008-of-00031.safetensors",
215
+ "model.layers.18.self_attn.q_norm.weight": "model-00008-of-00031.safetensors",
216
+ "model.layers.18.self_attn.q_proj.weight": "model-00008-of-00031.safetensors",
217
+ "model.layers.18.self_attn.v_proj.weight": "model-00008-of-00031.safetensors",
218
+ "model.layers.19.mlp.gate_proj.weight": "model-00008-of-00031.safetensors",
219
+ "model.layers.19.mlp.up_proj.weight": "model-00009-of-00031.safetensors",
220
+ "model.layers.19.post_attention_layernorm.weight": "model-00009-of-00031.safetensors",
221
+ "model.layers.19.self_attn.k_norm.weight": "model-00009-of-00031.safetensors",
222
+ "model.layers.19.self_attn.k_proj.weight": "model-00009-of-00031.safetensors",
223
+ "model.layers.19.self_attn.o_proj.weight": "model-00009-of-00031.safetensors",
224
+ "model.layers.19.self_attn.q_norm.weight": "model-00009-of-00031.safetensors",
225
+ "model.layers.19.self_attn.q_proj.weight": "model-00009-of-00031.safetensors",
226
+ "model.layers.19.self_attn.v_proj.weight": "model-00009-of-00031.safetensors",
227
+ "model.layers.19.input_layernorm.weight": "model-00009-of-00031.safetensors",
228
+ "model.layers.19.mlp.down_proj.weight": "model-00009-of-00031.safetensors",
229
+ "model.layers.20.input_layernorm.weight": "model-00009-of-00031.safetensors",
230
+ "model.layers.20.mlp.down_proj.weight": "model-00009-of-00031.safetensors",
231
+ "model.layers.20.mlp.gate_proj.weight": "model-00009-of-00031.safetensors",
232
+ "model.layers.20.mlp.up_proj.weight": "model-00009-of-00031.safetensors",
233
+ "model.layers.20.post_attention_layernorm.weight": "model-00009-of-00031.safetensors",
234
+ "model.layers.20.self_attn.k_norm.weight": "model-00009-of-00031.safetensors",
235
+ "model.layers.20.self_attn.k_proj.weight": "model-00009-of-00031.safetensors",
236
+ "model.layers.20.self_attn.o_proj.weight": "model-00009-of-00031.safetensors",
237
+ "model.layers.20.self_attn.q_norm.weight": "model-00009-of-00031.safetensors",
238
+ "model.layers.20.self_attn.q_proj.weight": "model-00009-of-00031.safetensors",
239
+ "model.layers.20.self_attn.v_proj.weight": "model-00009-of-00031.safetensors",
240
+ "model.layers.21.input_layernorm.weight": "model-00009-of-00031.safetensors",
241
+ "model.layers.21.mlp.down_proj.weight": "model-00009-of-00031.safetensors",
242
+ "model.layers.21.mlp.gate_proj.weight": "model-00009-of-00031.safetensors",
243
+ "model.layers.21.mlp.up_proj.weight": "model-00009-of-00031.safetensors",
244
+ "model.layers.21.post_attention_layernorm.weight": "model-00009-of-00031.safetensors",
245
+ "model.layers.21.self_attn.k_norm.weight": "model-00009-of-00031.safetensors",
246
+ "model.layers.21.self_attn.k_proj.weight": "model-00009-of-00031.safetensors",
247
+ "model.layers.21.self_attn.o_proj.weight": "model-00009-of-00031.safetensors",
248
+ "model.layers.21.self_attn.q_norm.weight": "model-00009-of-00031.safetensors",
249
+ "model.layers.21.self_attn.q_proj.weight": "model-00009-of-00031.safetensors",
250
+ "model.layers.21.self_attn.v_proj.weight": "model-00009-of-00031.safetensors",
251
+ "model.layers.22.input_layernorm.weight": "model-00009-of-00031.safetensors",
252
+ "model.layers.22.mlp.down_proj.weight": "model-00010-of-00031.safetensors",
253
+ "model.layers.22.mlp.gate_proj.weight": "model-00010-of-00031.safetensors",
254
+ "model.layers.22.mlp.up_proj.weight": "model-00010-of-00031.safetensors",
255
+ "model.layers.22.post_attention_layernorm.weight": "model-00010-of-00031.safetensors",
256
+ "model.layers.22.self_attn.k_norm.weight": "model-00010-of-00031.safetensors",
257
+ "model.layers.22.self_attn.k_proj.weight": "model-00010-of-00031.safetensors",
258
+ "model.layers.22.self_attn.o_proj.weight": "model-00010-of-00031.safetensors",
259
+ "model.layers.22.self_attn.q_norm.weight": "model-00010-of-00031.safetensors",
260
+ "model.layers.22.self_attn.q_proj.weight": "model-00010-of-00031.safetensors",
261
+ "model.layers.22.self_attn.v_proj.weight": "model-00010-of-00031.safetensors",
262
+ "model.layers.23.input_layernorm.weight": "model-00010-of-00031.safetensors",
263
+ "model.layers.23.mlp.down_proj.weight": "model-00010-of-00031.safetensors",
264
+ "model.layers.23.mlp.gate_proj.weight": "model-00010-of-00031.safetensors",
265
+ "model.layers.23.mlp.up_proj.weight": "model-00010-of-00031.safetensors",
266
+ "model.layers.23.post_attention_layernorm.weight": "model-00010-of-00031.safetensors",
267
+ "model.layers.23.self_attn.k_norm.weight": "model-00010-of-00031.safetensors",
268
+ "model.layers.23.self_attn.k_proj.weight": "model-00010-of-00031.safetensors",
269
+ "model.layers.23.self_attn.o_proj.weight": "model-00010-of-00031.safetensors",
270
+ "model.layers.23.self_attn.q_norm.weight": "model-00010-of-00031.safetensors",
271
+ "model.layers.23.self_attn.q_proj.weight": "model-00010-of-00031.safetensors",
272
+ "model.layers.23.self_attn.v_proj.weight": "model-00010-of-00031.safetensors",
273
+ "model.layers.24.input_layernorm.weight": "model-00010-of-00031.safetensors",
274
+ "model.layers.24.mlp.down_proj.weight": "model-00010-of-00031.safetensors",
275
+ "model.layers.24.mlp.gate_proj.weight": "model-00010-of-00031.safetensors",
276
+ "model.layers.24.mlp.up_proj.weight": "model-00010-of-00031.safetensors",
277
+ "model.layers.24.post_attention_layernorm.weight": "model-00010-of-00031.safetensors",
278
+ "model.layers.24.self_attn.k_norm.weight": "model-00010-of-00031.safetensors",
279
+ "model.layers.24.self_attn.k_proj.weight": "model-00010-of-00031.safetensors",
280
+ "model.layers.24.self_attn.o_proj.weight": "model-00011-of-00031.safetensors",
281
+ "model.layers.24.self_attn.q_norm.weight": "model-00011-of-00031.safetensors",
282
+ "model.layers.24.self_attn.q_proj.weight": "model-00011-of-00031.safetensors",
283
+ "model.layers.24.self_attn.v_proj.weight": "model-00011-of-00031.safetensors",
284
+ "model.layers.25.input_layernorm.weight": "model-00011-of-00031.safetensors",
285
+ "model.layers.25.mlp.down_proj.weight": "model-00011-of-00031.safetensors",
286
+ "model.layers.25.mlp.gate_proj.weight": "model-00011-of-00031.safetensors",
287
+ "model.layers.25.mlp.up_proj.weight": "model-00011-of-00031.safetensors",
288
+ "model.layers.25.post_attention_layernorm.weight": "model-00011-of-00031.safetensors",
289
+ "model.layers.25.self_attn.k_norm.weight": "model-00011-of-00031.safetensors",
290
+ "model.layers.25.self_attn.k_proj.weight": "model-00011-of-00031.safetensors",
291
+ "model.layers.25.self_attn.o_proj.weight": "model-00011-of-00031.safetensors",
292
+ "model.layers.25.self_attn.q_norm.weight": "model-00011-of-00031.safetensors",
293
+ "model.layers.25.self_attn.q_proj.weight": "model-00011-of-00031.safetensors",
294
+ "model.layers.25.self_attn.v_proj.weight": "model-00011-of-00031.safetensors",
295
+ "model.layers.26.input_layernorm.weight": "model-00011-of-00031.safetensors",
296
+ "model.layers.26.mlp.down_proj.weight": "model-00011-of-00031.safetensors",
297
+ "model.layers.26.mlp.gate_proj.weight": "model-00011-of-00031.safetensors",
298
+ "model.layers.26.mlp.up_proj.weight": "model-00011-of-00031.safetensors",
299
+ "model.layers.26.post_attention_layernorm.weight": "model-00011-of-00031.safetensors",
300
+ "model.layers.26.self_attn.k_norm.weight": "model-00011-of-00031.safetensors",
301
+ "model.layers.26.self_attn.k_proj.weight": "model-00011-of-00031.safetensors",
302
+ "model.layers.26.self_attn.o_proj.weight": "model-00011-of-00031.safetensors",
303
+ "model.layers.26.self_attn.q_norm.weight": "model-00011-of-00031.safetensors",
304
+ "model.layers.26.self_attn.q_proj.weight": "model-00011-of-00031.safetensors",
305
+ "model.layers.26.self_attn.v_proj.weight": "model-00011-of-00031.safetensors",
306
+ "model.layers.27.input_layernorm.weight": "model-00011-of-00031.safetensors",
307
+ "model.layers.27.mlp.down_proj.weight": "model-00011-of-00031.safetensors",
308
+ "model.layers.27.mlp.gate_proj.weight": "model-00011-of-00031.safetensors",
309
+ "model.layers.27.mlp.up_proj.weight": "model-00012-of-00031.safetensors",
310
+ "model.layers.27.post_attention_layernorm.weight": "model-00012-of-00031.safetensors",
311
+ "model.layers.27.self_attn.k_norm.weight": "model-00012-of-00031.safetensors",
312
+ "model.layers.27.self_attn.k_proj.weight": "model-00012-of-00031.safetensors",
313
+ "model.layers.27.self_attn.o_proj.weight": "model-00012-of-00031.safetensors",
314
+ "model.layers.27.self_attn.q_norm.weight": "model-00012-of-00031.safetensors",
315
+ "model.layers.27.self_attn.q_proj.weight": "model-00012-of-00031.safetensors",
316
+ "model.layers.27.self_attn.v_proj.weight": "model-00012-of-00031.safetensors",
317
+ "model.layers.28.input_layernorm.weight": "model-00012-of-00031.safetensors",
318
+ "model.layers.28.mlp.down_proj.weight": "model-00012-of-00031.safetensors",
319
+ "model.layers.28.mlp.gate_proj.weight": "model-00012-of-00031.safetensors",
320
+ "model.layers.28.mlp.up_proj.weight": "model-00012-of-00031.safetensors",
321
+ "model.layers.28.post_attention_layernorm.weight": "model-00012-of-00031.safetensors",
322
+ "model.layers.28.self_attn.k_norm.weight": "model-00012-of-00031.safetensors",
323
+ "model.layers.28.self_attn.k_proj.weight": "model-00012-of-00031.safetensors",
324
+ "model.layers.28.self_attn.o_proj.weight": "model-00012-of-00031.safetensors",
325
+ "model.layers.28.self_attn.q_norm.weight": "model-00012-of-00031.safetensors",
326
+ "model.layers.28.self_attn.q_proj.weight": "model-00012-of-00031.safetensors",
327
+ "model.layers.28.self_attn.v_proj.weight": "model-00012-of-00031.safetensors",
328
+ "model.layers.29.input_layernorm.weight": "model-00012-of-00031.safetensors",
329
+ "model.layers.29.mlp.down_proj.weight": "model-00012-of-00031.safetensors",
330
+ "model.layers.29.mlp.gate_proj.weight": "model-00012-of-00031.safetensors",
331
+ "model.layers.29.mlp.up_proj.weight": "model-00012-of-00031.safetensors",
332
+ "model.layers.29.post_attention_layernorm.weight": "model-00012-of-00031.safetensors",
333
+ "model.layers.29.self_attn.k_norm.weight": "model-00012-of-00031.safetensors",
334
+ "model.layers.29.self_attn.k_proj.weight": "model-00012-of-00031.safetensors",
335
+ "model.layers.29.self_attn.o_proj.weight": "model-00012-of-00031.safetensors",
336
+ "model.layers.29.self_attn.q_norm.weight": "model-00012-of-00031.safetensors",
337
+ "model.layers.29.self_attn.q_proj.weight": "model-00012-of-00031.safetensors",
338
+ "model.layers.29.self_attn.v_proj.weight": "model-00012-of-00031.safetensors",
339
+ "model.layers.30.input_layernorm.weight": "model-00012-of-00031.safetensors",
340
+ "model.layers.30.mlp.down_proj.weight": "model-00012-of-00031.safetensors",
341
+ "model.layers.30.mlp.gate_proj.weight": "model-00013-of-00031.safetensors",
342
+ "model.layers.30.mlp.up_proj.weight": "model-00013-of-00031.safetensors",
343
+ "model.layers.30.post_attention_layernorm.weight": "model-00013-of-00031.safetensors",
344
+ "model.layers.30.self_attn.k_norm.weight": "model-00013-of-00031.safetensors",
345
+ "model.layers.30.self_attn.k_proj.weight": "model-00013-of-00031.safetensors",
346
+ "model.layers.30.self_attn.o_proj.weight": "model-00013-of-00031.safetensors",
347
+ "model.layers.30.self_attn.q_norm.weight": "model-00013-of-00031.safetensors",
348
+ "model.layers.30.self_attn.q_proj.weight": "model-00013-of-00031.safetensors",
349
+ "model.layers.30.self_attn.v_proj.weight": "model-00013-of-00031.safetensors",
350
+ "model.layers.31.input_layernorm.weight": "model-00013-of-00031.safetensors",
351
+ "model.layers.31.mlp.down_proj.weight": "model-00013-of-00031.safetensors",
352
+ "model.layers.31.mlp.gate_proj.weight": "model-00013-of-00031.safetensors",
353
+ "model.layers.31.mlp.up_proj.weight": "model-00013-of-00031.safetensors",
354
+ "model.layers.31.post_attention_layernorm.weight": "model-00013-of-00031.safetensors",
355
+ "model.layers.31.self_attn.k_norm.weight": "model-00013-of-00031.safetensors",
356
+ "model.layers.31.self_attn.k_proj.weight": "model-00013-of-00031.safetensors",
357
+ "model.layers.31.self_attn.o_proj.weight": "model-00013-of-00031.safetensors",
358
+ "model.layers.31.self_attn.q_norm.weight": "model-00013-of-00031.safetensors",
359
+ "model.layers.31.self_attn.q_proj.weight": "model-00013-of-00031.safetensors",
360
+ "model.layers.31.self_attn.v_proj.weight": "model-00013-of-00031.safetensors",
361
+ "model.layers.32.input_layernorm.weight": "model-00013-of-00031.safetensors",
362
+ "model.layers.32.mlp.down_proj.weight": "model-00013-of-00031.safetensors",
363
+ "model.layers.32.mlp.gate_proj.weight": "model-00013-of-00031.safetensors",
364
+ "model.layers.32.mlp.up_proj.weight": "model-00013-of-00031.safetensors",
365
+ "model.layers.32.post_attention_layernorm.weight": "model-00013-of-00031.safetensors",
366
+ "model.layers.32.self_attn.k_norm.weight": "model-00013-of-00031.safetensors",
367
+ "model.layers.32.self_attn.k_proj.weight": "model-00013-of-00031.safetensors",
368
+ "model.layers.32.self_attn.o_proj.weight": "model-00013-of-00031.safetensors",
369
+ "model.layers.32.self_attn.q_norm.weight": "model-00013-of-00031.safetensors",
370
+ "model.layers.32.self_attn.q_proj.weight": "model-00013-of-00031.safetensors",
371
+ "model.layers.32.self_attn.v_proj.weight": "model-00013-of-00031.safetensors",
372
+ "model.layers.33.input_layernorm.weight": "model-00013-of-00031.safetensors",
373
+ "model.layers.33.mlp.down_proj.weight": "model-00014-of-00031.safetensors",
374
+ "model.layers.33.mlp.gate_proj.weight": "model-00014-of-00031.safetensors",
375
+ "model.layers.33.mlp.up_proj.weight": "model-00014-of-00031.safetensors",
376
+ "model.layers.33.post_attention_layernorm.weight": "model-00014-of-00031.safetensors",
377
+ "model.layers.33.self_attn.k_norm.weight": "model-00014-of-00031.safetensors",
378
+ "model.layers.33.self_attn.k_proj.weight": "model-00014-of-00031.safetensors",
379
+ "model.layers.33.self_attn.o_proj.weight": "model-00014-of-00031.safetensors",
380
+ "model.layers.33.self_attn.q_norm.weight": "model-00014-of-00031.safetensors",
381
+ "model.layers.33.self_attn.q_proj.weight": "model-00014-of-00031.safetensors",
382
+ "model.layers.33.self_attn.v_proj.weight": "model-00014-of-00031.safetensors",
383
+ "model.layers.34.input_layernorm.weight": "model-00014-of-00031.safetensors",
384
+ "model.layers.34.mlp.down_proj.weight": "model-00014-of-00031.safetensors",
385
+ "model.layers.34.mlp.gate_proj.weight": "model-00014-of-00031.safetensors",
386
+ "model.layers.34.mlp.up_proj.weight": "model-00014-of-00031.safetensors",
387
+ "model.layers.34.post_attention_layernorm.weight": "model-00014-of-00031.safetensors",
388
+ "model.layers.34.self_attn.k_norm.weight": "model-00014-of-00031.safetensors",
389
+ "model.layers.34.self_attn.k_proj.weight": "model-00014-of-00031.safetensors",
390
+ "model.layers.34.self_attn.o_proj.weight": "model-00014-of-00031.safetensors",
391
+ "model.layers.34.self_attn.q_norm.weight": "model-00014-of-00031.safetensors",
392
+ "model.layers.34.self_attn.q_proj.weight": "model-00014-of-00031.safetensors",
393
+ "model.layers.34.self_attn.v_proj.weight": "model-00014-of-00031.safetensors",
394
+ "model.layers.35.input_layernorm.weight": "model-00014-of-00031.safetensors",
395
+ "model.layers.35.mlp.down_proj.weight": "model-00014-of-00031.safetensors",
396
+ "model.layers.35.mlp.gate_proj.weight": "model-00014-of-00031.safetensors",
397
+ "model.layers.35.mlp.up_proj.weight": "model-00014-of-00031.safetensors",
398
+ "model.layers.35.post_attention_layernorm.weight": "model-00014-of-00031.safetensors",
399
+ "model.layers.35.self_attn.k_norm.weight": "model-00014-of-00031.safetensors",
400
+ "model.layers.35.self_attn.k_proj.weight": "model-00014-of-00031.safetensors",
401
+ "model.layers.35.self_attn.o_proj.weight": "model-00015-of-00031.safetensors",
402
+ "model.layers.35.self_attn.q_norm.weight": "model-00015-of-00031.safetensors",
403
+ "model.layers.35.self_attn.q_proj.weight": "model-00015-of-00031.safetensors",
404
+ "model.layers.35.self_attn.v_proj.weight": "model-00015-of-00031.safetensors",
405
+ "model.layers.36.mlp.gate_proj.weight": "model-00015-of-00031.safetensors",
406
+ "model.layers.36.mlp.up_proj.weight": "model-00015-of-00031.safetensors",
407
+ "model.layers.36.post_attention_layernorm.weight": "model-00015-of-00031.safetensors",
408
+ "model.layers.36.self_attn.k_norm.weight": "model-00015-of-00031.safetensors",
409
+ "model.layers.36.self_attn.k_proj.weight": "model-00015-of-00031.safetensors",
410
+ "model.layers.36.self_attn.o_proj.weight": "model-00015-of-00031.safetensors",
411
+ "model.layers.36.self_attn.q_norm.weight": "model-00015-of-00031.safetensors",
412
+ "model.layers.36.self_attn.q_proj.weight": "model-00015-of-00031.safetensors",
413
+ "model.layers.36.self_attn.v_proj.weight": "model-00015-of-00031.safetensors",
414
+ "model.layers.36.input_layernorm.weight": "model-00015-of-00031.safetensors",
415
+ "model.layers.36.mlp.down_proj.weight": "model-00015-of-00031.safetensors",
416
+ "model.layers.37.mlp.gate_proj.weight": "model-00015-of-00031.safetensors",
417
+ "model.layers.37.mlp.up_proj.weight": "model-00015-of-00031.safetensors",
418
+ "model.layers.37.post_attention_layernorm.weight": "model-00015-of-00031.safetensors",
419
+ "model.layers.37.self_attn.k_norm.weight": "model-00015-of-00031.safetensors",
420
+ "model.layers.37.self_attn.k_proj.weight": "model-00015-of-00031.safetensors",
421
+ "model.layers.37.self_attn.o_proj.weight": "model-00015-of-00031.safetensors",
422
+ "model.layers.37.self_attn.q_norm.weight": "model-00015-of-00031.safetensors",
423
+ "model.layers.37.self_attn.q_proj.weight": "model-00015-of-00031.safetensors",
424
+ "model.layers.37.self_attn.v_proj.weight": "model-00015-of-00031.safetensors",
425
+ "model.layers.37.input_layernorm.weight": "model-00015-of-00031.safetensors",
426
+ "model.layers.37.mlp.down_proj.weight": "model-00015-of-00031.safetensors",
427
+ "model.layers.38.input_layernorm.weight": "model-00015-of-00031.safetensors",
428
+ "model.layers.38.mlp.down_proj.weight": "model-00015-of-00031.safetensors",
429
+ "model.layers.38.mlp.gate_proj.weight": "model-00015-of-00031.safetensors",
430
+ "model.layers.38.mlp.up_proj.weight": "model-00016-of-00031.safetensors",
431
+ "model.layers.38.post_attention_layernorm.weight": "model-00016-of-00031.safetensors",
432
+ "model.layers.38.self_attn.k_norm.weight": "model-00016-of-00031.safetensors",
433
+ "model.layers.38.self_attn.k_proj.weight": "model-00016-of-00031.safetensors",
434
+ "model.layers.38.self_attn.o_proj.weight": "model-00016-of-00031.safetensors",
435
+ "model.layers.38.self_attn.q_norm.weight": "model-00016-of-00031.safetensors",
436
+ "model.layers.38.self_attn.q_proj.weight": "model-00016-of-00031.safetensors",
437
+ "model.layers.38.self_attn.v_proj.weight": "model-00016-of-00031.safetensors",
438
+ "model.layers.39.input_layernorm.weight": "model-00016-of-00031.safetensors",
439
+ "model.layers.39.mlp.down_proj.weight": "model-00016-of-00031.safetensors",
440
+ "model.layers.39.mlp.gate_proj.weight": "model-00016-of-00031.safetensors",
441
+ "model.layers.39.mlp.up_proj.weight": "model-00016-of-00031.safetensors",
442
+ "model.layers.39.post_attention_layernorm.weight": "model-00016-of-00031.safetensors",
443
+ "model.layers.39.self_attn.k_norm.weight": "model-00016-of-00031.safetensors",
444
+ "model.layers.39.self_attn.k_proj.weight": "model-00016-of-00031.safetensors",
445
+ "model.layers.39.self_attn.o_proj.weight": "model-00016-of-00031.safetensors",
446
+ "model.layers.39.self_attn.q_norm.weight": "model-00016-of-00031.safetensors",
447
+ "model.layers.39.self_attn.q_proj.weight": "model-00016-of-00031.safetensors",
448
+ "model.layers.39.self_attn.v_proj.weight": "model-00016-of-00031.safetensors",
449
+ "model.layers.40.input_layernorm.weight": "model-00016-of-00031.safetensors",
450
+ "model.layers.40.mlp.down_proj.weight": "model-00016-of-00031.safetensors",
451
+ "model.layers.40.mlp.gate_proj.weight": "model-00016-of-00031.safetensors",
452
+ "model.layers.40.mlp.up_proj.weight": "model-00016-of-00031.safetensors",
453
+ "model.layers.40.post_attention_layernorm.weight": "model-00016-of-00031.safetensors",
454
+ "model.layers.40.self_attn.k_norm.weight": "model-00016-of-00031.safetensors",
455
+ "model.layers.40.self_attn.k_proj.weight": "model-00016-of-00031.safetensors",
456
+ "model.layers.40.self_attn.o_proj.weight": "model-00016-of-00031.safetensors",
457
+ "model.layers.40.self_attn.q_norm.weight": "model-00016-of-00031.safetensors",
458
+ "model.layers.40.self_attn.q_proj.weight": "model-00016-of-00031.safetensors",
459
+ "model.layers.40.self_attn.v_proj.weight": "model-00016-of-00031.safetensors",
460
+ "model.layers.41.input_layernorm.weight": "model-00016-of-00031.safetensors",
461
+ "model.layers.41.mlp.down_proj.weight": "model-00016-of-00031.safetensors",
462
+ "model.layers.41.mlp.gate_proj.weight": "model-00017-of-00031.safetensors",
463
+ "model.layers.41.mlp.up_proj.weight": "model-00017-of-00031.safetensors",
464
+ "model.layers.41.post_attention_layernorm.weight": "model-00017-of-00031.safetensors",
465
+ "model.layers.41.self_attn.k_norm.weight": "model-00017-of-00031.safetensors",
466
+ "model.layers.41.self_attn.k_proj.weight": "model-00017-of-00031.safetensors",
467
+ "model.layers.41.self_attn.o_proj.weight": "model-00017-of-00031.safetensors",
468
+ "model.layers.41.self_attn.q_norm.weight": "model-00017-of-00031.safetensors",
469
+ "model.layers.41.self_attn.q_proj.weight": "model-00017-of-00031.safetensors",
470
+ "model.layers.41.self_attn.v_proj.weight": "model-00017-of-00031.safetensors",
471
+ "model.layers.42.mlp.down_proj.weight": "model-00017-of-00031.safetensors",
472
+ "model.layers.42.mlp.gate_proj.weight": "model-00017-of-00031.safetensors",
473
+ "model.layers.42.mlp.up_proj.weight": "model-00017-of-00031.safetensors",
474
+ "model.layers.42.post_attention_layernorm.weight": "model-00017-of-00031.safetensors",
475
+ "model.layers.42.self_attn.k_norm.weight": "model-00017-of-00031.safetensors",
476
+ "model.layers.42.self_attn.k_proj.weight": "model-00017-of-00031.safetensors",
477
+ "model.layers.42.self_attn.o_proj.weight": "model-00017-of-00031.safetensors",
478
+ "model.layers.42.self_attn.q_norm.weight": "model-00017-of-00031.safetensors",
479
+ "model.layers.42.self_attn.q_proj.weight": "model-00017-of-00031.safetensors",
480
+ "model.layers.42.self_attn.v_proj.weight": "model-00017-of-00031.safetensors",
481
+ "model.layers.42.input_layernorm.weight": "model-00017-of-00031.safetensors",
482
+ "model.layers.43.mlp.down_proj.weight": "model-00017-of-00031.safetensors",
483
+ "model.layers.43.mlp.gate_proj.weight": "model-00017-of-00031.safetensors",
484
+ "model.layers.43.mlp.up_proj.weight": "model-00017-of-00031.safetensors",
485
+ "model.layers.43.post_attention_layernorm.weight": "model-00017-of-00031.safetensors",
486
+ "model.layers.43.self_attn.k_norm.weight": "model-00017-of-00031.safetensors",
487
+ "model.layers.43.self_attn.k_proj.weight": "model-00017-of-00031.safetensors",
488
+ "model.layers.43.self_attn.o_proj.weight": "model-00017-of-00031.safetensors",
489
+ "model.layers.43.self_attn.q_norm.weight": "model-00017-of-00031.safetensors",
490
+ "model.layers.43.self_attn.q_proj.weight": "model-00017-of-00031.safetensors",
491
+ "model.layers.43.self_attn.v_proj.weight": "model-00017-of-00031.safetensors",
492
+ "model.layers.43.input_layernorm.weight": "model-00017-of-00031.safetensors",
493
+ "model.layers.44.input_layernorm.weight": "model-00017-of-00031.safetensors",
494
+ "model.layers.44.mlp.down_proj.weight": "model-00018-of-00031.safetensors",
495
+ "model.layers.44.mlp.gate_proj.weight": "model-00018-of-00031.safetensors",
496
+ "model.layers.44.mlp.up_proj.weight": "model-00018-of-00031.safetensors",
497
+ "model.layers.44.post_attention_layernorm.weight": "model-00018-of-00031.safetensors",
498
+ "model.layers.44.self_attn.k_norm.weight": "model-00018-of-00031.safetensors",
499
+ "model.layers.44.self_attn.k_proj.weight": "model-00018-of-00031.safetensors",
500
+ "model.layers.44.self_attn.o_proj.weight": "model-00018-of-00031.safetensors",
501
+ "model.layers.44.self_attn.q_norm.weight": "model-00018-of-00031.safetensors",
502
+ "model.layers.44.self_attn.q_proj.weight": "model-00018-of-00031.safetensors",
503
+ "model.layers.44.self_attn.v_proj.weight": "model-00018-of-00031.safetensors",
504
+ "model.layers.45.input_layernorm.weight": "model-00018-of-00031.safetensors",
505
+ "model.layers.45.mlp.down_proj.weight": "model-00018-of-00031.safetensors",
506
+ "model.layers.45.mlp.gate_proj.weight": "model-00018-of-00031.safetensors",
507
+ "model.layers.45.mlp.up_proj.weight": "model-00018-of-00031.safetensors",
508
+ "model.layers.45.post_attention_layernorm.weight": "model-00018-of-00031.safetensors",
509
+ "model.layers.45.self_attn.k_norm.weight": "model-00018-of-00031.safetensors",
510
+ "model.layers.45.self_attn.k_proj.weight": "model-00018-of-00031.safetensors",
511
+ "model.layers.45.self_attn.o_proj.weight": "model-00018-of-00031.safetensors",
512
+ "model.layers.45.self_attn.q_norm.weight": "model-00018-of-00031.safetensors",
513
+ "model.layers.45.self_attn.q_proj.weight": "model-00018-of-00031.safetensors",
514
+ "model.layers.45.self_attn.v_proj.weight": "model-00018-of-00031.safetensors",
515
+ "model.layers.46.input_layernorm.weight": "model-00018-of-00031.safetensors",
516
+ "model.layers.46.mlp.down_proj.weight": "model-00018-of-00031.safetensors",
517
+ "model.layers.46.mlp.gate_proj.weight": "model-00018-of-00031.safetensors",
518
+ "model.layers.46.mlp.up_proj.weight": "model-00018-of-00031.safetensors",
519
+ "model.layers.46.post_attention_layernorm.weight": "model-00018-of-00031.safetensors",
520
+ "model.layers.46.self_attn.k_norm.weight": "model-00018-of-00031.safetensors",
521
+ "model.layers.46.self_attn.k_proj.weight": "model-00018-of-00031.safetensors",
522
+ "model.layers.46.self_attn.o_proj.weight": "model-00019-of-00031.safetensors",
523
+ "model.layers.46.self_attn.q_norm.weight": "model-00019-of-00031.safetensors",
524
+ "model.layers.46.self_attn.q_proj.weight": "model-00019-of-00031.safetensors",
525
+ "model.layers.46.self_attn.v_proj.weight": "model-00019-of-00031.safetensors",
526
+ "model.layers.47.input_layernorm.weight": "model-00019-of-00031.safetensors",
527
+ "model.layers.47.mlp.down_proj.weight": "model-00019-of-00031.safetensors",
528
+ "model.layers.47.mlp.gate_proj.weight": "model-00019-of-00031.safetensors",
529
+ "model.layers.47.mlp.up_proj.weight": "model-00019-of-00031.safetensors",
530
+ "model.layers.47.post_attention_layernorm.weight": "model-00019-of-00031.safetensors",
531
+ "model.layers.47.self_attn.k_norm.weight": "model-00019-of-00031.safetensors",
532
+ "model.layers.47.self_attn.k_proj.weight": "model-00019-of-00031.safetensors",
533
+ "model.layers.47.self_attn.o_proj.weight": "model-00019-of-00031.safetensors",
534
+ "model.layers.47.self_attn.q_norm.weight": "model-00019-of-00031.safetensors",
535
+ "model.layers.47.self_attn.q_proj.weight": "model-00019-of-00031.safetensors",
536
+ "model.layers.47.self_attn.v_proj.weight": "model-00019-of-00031.safetensors",
537
+ "model.layers.48.input_layernorm.weight": "model-00019-of-00031.safetensors",
538
+ "model.layers.48.mlp.down_proj.weight": "model-00019-of-00031.safetensors",
539
+ "model.layers.48.mlp.gate_proj.weight": "model-00019-of-00031.safetensors",
540
+ "model.layers.48.mlp.up_proj.weight": "model-00019-of-00031.safetensors",
541
+ "model.layers.48.post_attention_layernorm.weight": "model-00019-of-00031.safetensors",
542
+ "model.layers.48.self_attn.k_norm.weight": "model-00019-of-00031.safetensors",
543
+ "model.layers.48.self_attn.k_proj.weight": "model-00019-of-00031.safetensors",
544
+ "model.layers.48.self_attn.o_proj.weight": "model-00019-of-00031.safetensors",
545
+ "model.layers.48.self_attn.q_norm.weight": "model-00019-of-00031.safetensors",
546
+ "model.layers.48.self_attn.q_proj.weight": "model-00019-of-00031.safetensors",
547
+ "model.layers.48.self_attn.v_proj.weight": "model-00019-of-00031.safetensors",
548
+ "model.layers.49.input_layernorm.weight": "model-00019-of-00031.safetensors",
549
+ "model.layers.49.mlp.down_proj.weight": "model-00019-of-00031.safetensors",
550
+ "model.layers.49.mlp.gate_proj.weight": "model-00019-of-00031.safetensors",
551
+ "model.layers.49.mlp.up_proj.weight": "model-00020-of-00031.safetensors",
552
+ "model.layers.49.post_attention_layernorm.weight": "model-00020-of-00031.safetensors",
553
+ "model.layers.49.self_attn.k_norm.weight": "model-00020-of-00031.safetensors",
554
+ "model.layers.49.self_attn.k_proj.weight": "model-00020-of-00031.safetensors",
555
+ "model.layers.49.self_attn.o_proj.weight": "model-00020-of-00031.safetensors",
556
+ "model.layers.49.self_attn.q_norm.weight": "model-00020-of-00031.safetensors",
557
+ "model.layers.49.self_attn.q_proj.weight": "model-00020-of-00031.safetensors",
558
+ "model.layers.49.self_attn.v_proj.weight": "model-00020-of-00031.safetensors",
559
+ "model.layers.50.input_layernorm.weight": "model-00020-of-00031.safetensors",
560
+ "model.layers.50.mlp.down_proj.weight": "model-00020-of-00031.safetensors",
561
+ "model.layers.50.mlp.gate_proj.weight": "model-00020-of-00031.safetensors",
562
+ "model.layers.50.mlp.up_proj.weight": "model-00020-of-00031.safetensors",
563
+ "model.layers.50.post_attention_layernorm.weight": "model-00020-of-00031.safetensors",
564
+ "model.layers.50.self_attn.k_norm.weight": "model-00020-of-00031.safetensors",
565
+ "model.layers.50.self_attn.k_proj.weight": "model-00020-of-00031.safetensors",
566
+ "model.layers.50.self_attn.o_proj.weight": "model-00020-of-00031.safetensors",
567
+ "model.layers.50.self_attn.q_norm.weight": "model-00020-of-00031.safetensors",
568
+ "model.layers.50.self_attn.q_proj.weight": "model-00020-of-00031.safetensors",
569
+ "model.layers.50.self_attn.v_proj.weight": "model-00020-of-00031.safetensors",
570
+ "model.layers.51.input_layernorm.weight": "model-00020-of-00031.safetensors",
571
+ "model.layers.51.mlp.down_proj.weight": "model-00020-of-00031.safetensors",
572
+ "model.layers.51.mlp.gate_proj.weight": "model-00020-of-00031.safetensors",
573
+ "model.layers.51.mlp.up_proj.weight": "model-00020-of-00031.safetensors",
574
+ "model.layers.51.post_attention_layernorm.weight": "model-00020-of-00031.safetensors",
575
+ "model.layers.51.self_attn.k_norm.weight": "model-00020-of-00031.safetensors",
576
+ "model.layers.51.self_attn.k_proj.weight": "model-00020-of-00031.safetensors",
577
+ "model.layers.51.self_attn.o_proj.weight": "model-00020-of-00031.safetensors",
578
+ "model.layers.51.self_attn.q_norm.weight": "model-00020-of-00031.safetensors",
579
+ "model.layers.51.self_attn.q_proj.weight": "model-00020-of-00031.safetensors",
580
+ "model.layers.51.self_attn.v_proj.weight": "model-00020-of-00031.safetensors",
581
+ "model.layers.52.input_layernorm.weight": "model-00020-of-00031.safetensors",
582
+ "model.layers.52.mlp.gate_proj.weight": "model-00020-of-00031.safetensors",
583
+ "model.layers.52.self_attn.k_norm.weight": "model-00020-of-00031.safetensors",
584
+ "model.layers.52.self_attn.k_proj.weight": "model-00020-of-00031.safetensors",
585
+ "model.layers.52.self_attn.o_proj.weight": "model-00020-of-00031.safetensors",
586
+ "model.layers.52.self_attn.q_norm.weight": "model-00020-of-00031.safetensors",
587
+ "model.layers.52.self_attn.q_proj.weight": "model-00021-of-00031.safetensors",
588
+ "model.layers.52.self_attn.v_proj.weight": "model-00021-of-00031.safetensors",
589
+ "model.layers.52.mlp.down_proj.weight": "model-00021-of-00031.safetensors",
590
+ "model.layers.52.mlp.up_proj.weight": "model-00021-of-00031.safetensors",
591
+ "model.layers.52.post_attention_layernorm.weight": "model-00021-of-00031.safetensors",
592
+ "model.layers.53.input_layernorm.weight": "model-00021-of-00031.safetensors",
593
+ "model.layers.53.mlp.gate_proj.weight": "model-00021-of-00031.safetensors",
594
+ "model.layers.53.self_attn.k_norm.weight": "model-00021-of-00031.safetensors",
595
+ "model.layers.53.self_attn.k_proj.weight": "model-00021-of-00031.safetensors",
596
+ "model.layers.53.self_attn.o_proj.weight": "model-00021-of-00031.safetensors",
597
+ "model.layers.53.self_attn.q_norm.weight": "model-00021-of-00031.safetensors",
598
+ "model.layers.53.self_attn.q_proj.weight": "model-00021-of-00031.safetensors",
599
+ "model.layers.53.self_attn.v_proj.weight": "model-00021-of-00031.safetensors",
600
+ "model.layers.53.mlp.down_proj.weight": "model-00021-of-00031.safetensors",
601
+ "model.layers.53.mlp.up_proj.weight": "model-00021-of-00031.safetensors",
602
+ "model.layers.53.post_attention_layernorm.weight": "model-00021-of-00031.safetensors",
603
+ "model.layers.54.input_layernorm.weight": "model-00021-of-00031.safetensors",
604
+ "model.layers.54.mlp.down_proj.weight": "model-00021-of-00031.safetensors",
605
+ "model.layers.54.mlp.gate_proj.weight": "model-00021-of-00031.safetensors",
606
+ "model.layers.54.mlp.up_proj.weight": "model-00021-of-00031.safetensors",
607
+ "model.layers.54.post_attention_layernorm.weight": "model-00021-of-00031.safetensors",
608
+ "model.layers.54.self_attn.k_norm.weight": "model-00021-of-00031.safetensors",
609
+ "model.layers.54.self_attn.k_proj.weight": "model-00021-of-00031.safetensors",
610
+ "model.layers.54.self_attn.o_proj.weight": "model-00021-of-00031.safetensors",
611
+ "model.layers.54.self_attn.q_norm.weight": "model-00021-of-00031.safetensors",
612
+ "model.layers.54.self_attn.q_proj.weight": "model-00021-of-00031.safetensors",
613
+ "model.layers.54.self_attn.v_proj.weight": "model-00021-of-00031.safetensors",
614
+ "model.layers.55.input_layernorm.weight": "model-00021-of-00031.safetensors",
615
+ "model.layers.55.mlp.down_proj.weight": "model-00022-of-00031.safetensors",
616
+ "model.layers.55.mlp.gate_proj.weight": "model-00022-of-00031.safetensors",
617
+ "model.layers.55.mlp.up_proj.weight": "model-00022-of-00031.safetensors",
618
+ "model.layers.55.post_attention_layernorm.weight": "model-00022-of-00031.safetensors",
619
+ "model.layers.55.self_attn.k_norm.weight": "model-00022-of-00031.safetensors",
620
+ "model.layers.55.self_attn.k_proj.weight": "model-00022-of-00031.safetensors",
621
+ "model.layers.55.self_attn.o_proj.weight": "model-00022-of-00031.safetensors",
622
+ "model.layers.55.self_attn.q_norm.weight": "model-00022-of-00031.safetensors",
623
+ "model.layers.55.self_attn.q_proj.weight": "model-00022-of-00031.safetensors",
624
+ "model.layers.55.self_attn.v_proj.weight": "model-00022-of-00031.safetensors",
625
+ "model.layers.56.input_layernorm.weight": "model-00022-of-00031.safetensors",
626
+ "model.layers.56.mlp.down_proj.weight": "model-00022-of-00031.safetensors",
627
+ "model.layers.56.mlp.gate_proj.weight": "model-00022-of-00031.safetensors",
628
+ "model.layers.56.mlp.up_proj.weight": "model-00022-of-00031.safetensors",
629
+ "model.layers.56.post_attention_layernorm.weight": "model-00022-of-00031.safetensors",
630
+ "model.layers.56.self_attn.k_norm.weight": "model-00022-of-00031.safetensors",
631
+ "model.layers.56.self_attn.k_proj.weight": "model-00022-of-00031.safetensors",
632
+ "model.layers.56.self_attn.o_proj.weight": "model-00022-of-00031.safetensors",
633
+ "model.layers.56.self_attn.q_norm.weight": "model-00022-of-00031.safetensors",
634
+ "model.layers.56.self_attn.q_proj.weight": "model-00022-of-00031.safetensors",
635
+ "model.layers.56.self_attn.v_proj.weight": "model-00022-of-00031.safetensors",
636
+ "model.layers.57.mlp.gate_proj.weight": "model-00022-of-00031.safetensors",
637
+ "model.layers.57.mlp.up_proj.weight": "model-00022-of-00031.safetensors",
638
+ "model.layers.57.post_attention_layernorm.weight": "model-00022-of-00031.safetensors",
639
+ "model.layers.57.self_attn.k_norm.weight": "model-00022-of-00031.safetensors",
640
+ "model.layers.57.self_attn.k_proj.weight": "model-00022-of-00031.safetensors",
641
+ "model.layers.57.self_attn.o_proj.weight": "model-00022-of-00031.safetensors",
642
+ "model.layers.57.self_attn.q_norm.weight": "model-00022-of-00031.safetensors",
643
+ "model.layers.57.self_attn.q_proj.weight": "model-00022-of-00031.safetensors",
644
+ "model.layers.57.self_attn.v_proj.weight": "model-00022-of-00031.safetensors",
645
+ "model.layers.57.input_layernorm.weight": "model-00022-of-00031.safetensors",
646
+ "model.layers.57.mlp.down_proj.weight": "model-00023-of-00031.safetensors",
647
+ "model.layers.58.input_layernorm.weight": "model-00023-of-00031.safetensors",
648
+ "model.layers.58.mlp.down_proj.weight": "model-00023-of-00031.safetensors",
649
+ "model.layers.58.mlp.gate_proj.weight": "model-00023-of-00031.safetensors",
650
+ "model.layers.58.mlp.up_proj.weight": "model-00023-of-00031.safetensors",
651
+ "model.layers.58.post_attention_layernorm.weight": "model-00023-of-00031.safetensors",
652
+ "model.layers.58.self_attn.k_norm.weight": "model-00023-of-00031.safetensors",
653
+ "model.layers.58.self_attn.k_proj.weight": "model-00023-of-00031.safetensors",
654
+ "model.layers.58.self_attn.o_proj.weight": "model-00023-of-00031.safetensors",
655
+ "model.layers.58.self_attn.q_norm.weight": "model-00023-of-00031.safetensors",
656
+ "model.layers.58.self_attn.q_proj.weight": "model-00023-of-00031.safetensors",
657
+ "model.layers.58.self_attn.v_proj.weight": "model-00023-of-00031.safetensors",
658
+ "model.layers.59.input_layernorm.weight": "model-00023-of-00031.safetensors",
659
+ "model.layers.59.mlp.down_proj.weight": "model-00023-of-00031.safetensors",
660
+ "model.layers.59.mlp.gate_proj.weight": "model-00023-of-00031.safetensors",
661
+ "model.layers.59.mlp.up_proj.weight": "model-00023-of-00031.safetensors",
662
+ "model.layers.59.post_attention_layernorm.weight": "model-00023-of-00031.safetensors",
663
+ "model.layers.59.self_attn.k_norm.weight": "model-00023-of-00031.safetensors",
664
+ "model.layers.59.self_attn.k_proj.weight": "model-00023-of-00031.safetensors",
665
+ "model.layers.59.self_attn.o_proj.weight": "model-00023-of-00031.safetensors",
666
+ "model.layers.59.self_attn.q_norm.weight": "model-00023-of-00031.safetensors",
667
+ "model.layers.59.self_attn.q_proj.weight": "model-00023-of-00031.safetensors",
668
+ "model.layers.59.self_attn.v_proj.weight": "model-00023-of-00031.safetensors",
669
+ "model.layers.60.input_layernorm.weight": "model-00023-of-00031.safetensors",
670
+ "model.layers.60.mlp.down_proj.weight": "model-00023-of-00031.safetensors",
671
+ "model.layers.60.mlp.gate_proj.weight": "model-00023-of-00031.safetensors",
672
+ "model.layers.60.mlp.up_proj.weight": "model-00024-of-00031.safetensors",
673
+ "model.layers.60.post_attention_layernorm.weight": "model-00024-of-00031.safetensors",
674
+ "model.layers.60.self_attn.k_norm.weight": "model-00024-of-00031.safetensors",
675
+ "model.layers.60.self_attn.k_proj.weight": "model-00024-of-00031.safetensors",
676
+ "model.layers.60.self_attn.o_proj.weight": "model-00024-of-00031.safetensors",
677
+ "model.layers.60.self_attn.q_norm.weight": "model-00024-of-00031.safetensors",
678
+ "model.layers.60.self_attn.q_proj.weight": "model-00024-of-00031.safetensors",
679
+ "model.layers.60.self_attn.v_proj.weight": "model-00024-of-00031.safetensors",
680
+ "model.layers.61.input_layernorm.weight": "model-00024-of-00031.safetensors",
681
+ "model.layers.61.mlp.down_proj.weight": "model-00024-of-00031.safetensors",
682
+ "model.layers.61.mlp.gate_proj.weight": "model-00024-of-00031.safetensors",
683
+ "model.layers.61.mlp.up_proj.weight": "model-00024-of-00031.safetensors",
684
+ "model.layers.61.post_attention_layernorm.weight": "model-00024-of-00031.safetensors",
685
+ "model.layers.61.self_attn.k_norm.weight": "model-00024-of-00031.safetensors",
686
+ "model.layers.61.self_attn.k_proj.weight": "model-00024-of-00031.safetensors",
687
+ "model.layers.61.self_attn.o_proj.weight": "model-00024-of-00031.safetensors",
688
+ "model.layers.61.self_attn.q_norm.weight": "model-00024-of-00031.safetensors",
689
+ "model.layers.61.self_attn.q_proj.weight": "model-00024-of-00031.safetensors",
690
+ "model.layers.61.self_attn.v_proj.weight": "model-00024-of-00031.safetensors",
691
+ "model.layers.62.self_attn.o_proj.weight": "model-00024-of-00031.safetensors",
692
+ "model.layers.62.self_attn.q_norm.weight": "model-00024-of-00031.safetensors",
693
+ "model.layers.62.self_attn.q_proj.weight": "model-00024-of-00031.safetensors",
694
+ "model.layers.62.self_attn.v_proj.weight": "model-00024-of-00031.safetensors",
695
+ "model.layers.62.input_layernorm.weight": "model-00024-of-00031.safetensors",
696
+ "model.layers.62.mlp.down_proj.weight": "model-00024-of-00031.safetensors",
697
+ "model.layers.62.mlp.gate_proj.weight": "model-00024-of-00031.safetensors",
698
+ "model.layers.62.mlp.up_proj.weight": "model-00024-of-00031.safetensors",
699
+ "model.layers.62.post_attention_layernorm.weight": "model-00024-of-00031.safetensors",
700
+ "model.layers.62.self_attn.k_norm.weight": "model-00024-of-00031.safetensors",
701
+ "model.layers.62.self_attn.k_proj.weight": "model-00024-of-00031.safetensors",
702
+ "model.layers.63.input_layernorm.weight": "model-00024-of-00031.safetensors",
703
+ "model.layers.63.mlp.down_proj.weight": "model-00024-of-00031.safetensors",
704
+ "model.layers.63.mlp.gate_proj.weight": "model-00025-of-00031.safetensors",
705
+ "model.layers.63.mlp.up_proj.weight": "model-00025-of-00031.safetensors",
706
+ "model.layers.63.post_attention_layernorm.weight": "model-00025-of-00031.safetensors",
707
+ "model.layers.63.self_attn.k_norm.weight": "model-00025-of-00031.safetensors",
708
+ "model.layers.63.self_attn.k_proj.weight": "model-00025-of-00031.safetensors",
709
+ "model.layers.63.self_attn.o_proj.weight": "model-00025-of-00031.safetensors",
710
+ "model.layers.63.self_attn.q_norm.weight": "model-00025-of-00031.safetensors",
711
+ "model.layers.63.self_attn.q_proj.weight": "model-00025-of-00031.safetensors",
712
+ "model.layers.63.self_attn.v_proj.weight": "model-00025-of-00031.safetensors",
713
+ "model.layers.64.input_layernorm.weight": "model-00025-of-00031.safetensors",
714
+ "model.layers.64.mlp.down_proj.weight": "model-00025-of-00031.safetensors",
715
+ "model.layers.64.mlp.gate_proj.weight": "model-00025-of-00031.safetensors",
716
+ "model.layers.64.mlp.up_proj.weight": "model-00025-of-00031.safetensors",
717
+ "model.layers.64.post_attention_layernorm.weight": "model-00025-of-00031.safetensors",
718
+ "model.layers.64.self_attn.k_norm.weight": "model-00025-of-00031.safetensors",
719
+ "model.layers.64.self_attn.k_proj.weight": "model-00025-of-00031.safetensors",
720
+ "model.layers.64.self_attn.o_proj.weight": "model-00025-of-00031.safetensors",
721
+ "model.layers.64.self_attn.q_norm.weight": "model-00025-of-00031.safetensors",
722
+ "model.layers.64.self_attn.q_proj.weight": "model-00025-of-00031.safetensors",
723
+ "model.layers.64.self_attn.v_proj.weight": "model-00025-of-00031.safetensors",
724
+ "model.layers.65.input_layernorm.weight": "model-00025-of-00031.safetensors",
725
+ "model.layers.65.mlp.down_proj.weight": "model-00025-of-00031.safetensors",
726
+ "model.layers.65.mlp.gate_proj.weight": "model-00025-of-00031.safetensors",
727
+ "model.layers.65.mlp.up_proj.weight": "model-00025-of-00031.safetensors",
728
+ "model.layers.65.post_attention_layernorm.weight": "model-00025-of-00031.safetensors",
729
+ "model.layers.65.self_attn.k_norm.weight": "model-00025-of-00031.safetensors",
730
+ "model.layers.65.self_attn.k_proj.weight": "model-00025-of-00031.safetensors",
731
+ "model.layers.65.self_attn.o_proj.weight": "model-00025-of-00031.safetensors",
732
+ "model.layers.65.self_attn.q_norm.weight": "model-00025-of-00031.safetensors",
733
+ "model.layers.65.self_attn.q_proj.weight": "model-00025-of-00031.safetensors",
734
+ "model.layers.65.self_attn.v_proj.weight": "model-00025-of-00031.safetensors",
735
+ "model.layers.66.input_layernorm.weight": "model-00025-of-00031.safetensors",
736
+ "model.layers.66.mlp.down_proj.weight": "model-00026-of-00031.safetensors",
737
+ "model.layers.66.mlp.gate_proj.weight": "model-00026-of-00031.safetensors",
738
+ "model.layers.66.mlp.up_proj.weight": "model-00026-of-00031.safetensors",
739
+ "model.layers.66.post_attention_layernorm.weight": "model-00026-of-00031.safetensors",
740
+ "model.layers.66.self_attn.k_norm.weight": "model-00026-of-00031.safetensors",
741
+ "model.layers.66.self_attn.k_proj.weight": "model-00026-of-00031.safetensors",
742
+ "model.layers.66.self_attn.o_proj.weight": "model-00026-of-00031.safetensors",
743
+ "model.layers.66.self_attn.q_norm.weight": "model-00026-of-00031.safetensors",
744
+ "model.layers.66.self_attn.q_proj.weight": "model-00026-of-00031.safetensors",
745
+ "model.layers.66.self_attn.v_proj.weight": "model-00026-of-00031.safetensors",
746
+ "model.layers.67.input_layernorm.weight": "model-00026-of-00031.safetensors",
747
+ "model.layers.67.mlp.down_proj.weight": "model-00026-of-00031.safetensors",
748
+ "model.layers.67.mlp.gate_proj.weight": "model-00026-of-00031.safetensors",
749
+ "model.layers.67.mlp.up_proj.weight": "model-00026-of-00031.safetensors",
750
+ "model.layers.67.post_attention_layernorm.weight": "model-00026-of-00031.safetensors",
751
+ "model.layers.67.self_attn.k_norm.weight": "model-00026-of-00031.safetensors",
752
+ "model.layers.67.self_attn.k_proj.weight": "model-00026-of-00031.safetensors",
753
+ "model.layers.67.self_attn.o_proj.weight": "model-00026-of-00031.safetensors",
754
+ "model.layers.67.self_attn.q_norm.weight": "model-00026-of-00031.safetensors",
755
+ "model.layers.67.self_attn.q_proj.weight": "model-00026-of-00031.safetensors",
756
+ "model.layers.67.self_attn.v_proj.weight": "model-00026-of-00031.safetensors",
757
+ "model.layers.68.input_layernorm.weight": "model-00026-of-00031.safetensors",
758
+ "model.layers.68.mlp.down_proj.weight": "model-00026-of-00031.safetensors",
759
+ "model.layers.68.mlp.gate_proj.weight": "model-00026-of-00031.safetensors",
760
+ "model.layers.68.mlp.up_proj.weight": "model-00026-of-00031.safetensors",
761
+ "model.layers.68.post_attention_layernorm.weight": "model-00026-of-00031.safetensors",
762
+ "model.layers.68.self_attn.k_norm.weight": "model-00026-of-00031.safetensors",
763
+ "model.layers.68.self_attn.k_proj.weight": "model-00026-of-00031.safetensors",
764
+ "model.layers.68.self_attn.o_proj.weight": "model-00027-of-00031.safetensors",
765
+ "model.layers.68.self_attn.q_norm.weight": "model-00027-of-00031.safetensors",
766
+ "model.layers.68.self_attn.q_proj.weight": "model-00027-of-00031.safetensors",
767
+ "model.layers.68.self_attn.v_proj.weight": "model-00027-of-00031.safetensors",
768
+ "model.layers.69.input_layernorm.weight": "model-00027-of-00031.safetensors",
769
+ "model.layers.69.mlp.down_proj.weight": "model-00027-of-00031.safetensors",
770
+ "model.layers.69.mlp.gate_proj.weight": "model-00027-of-00031.safetensors",
771
+ "model.layers.69.mlp.up_proj.weight": "model-00027-of-00031.safetensors",
772
+ "model.layers.69.post_attention_layernorm.weight": "model-00027-of-00031.safetensors",
773
+ "model.layers.69.self_attn.k_norm.weight": "model-00027-of-00031.safetensors",
774
+ "model.layers.69.self_attn.k_proj.weight": "model-00027-of-00031.safetensors",
775
+ "model.layers.69.self_attn.o_proj.weight": "model-00027-of-00031.safetensors",
776
+ "model.layers.69.self_attn.q_norm.weight": "model-00027-of-00031.safetensors",
777
+ "model.layers.69.self_attn.q_proj.weight": "model-00027-of-00031.safetensors",
778
+ "model.layers.69.self_attn.v_proj.weight": "model-00027-of-00031.safetensors",
779
+ "model.layers.70.input_layernorm.weight": "model-00027-of-00031.safetensors",
780
+ "model.layers.70.mlp.down_proj.weight": "model-00027-of-00031.safetensors",
781
+ "model.layers.70.mlp.gate_proj.weight": "model-00027-of-00031.safetensors",
782
+ "model.layers.70.mlp.up_proj.weight": "model-00027-of-00031.safetensors",
783
+ "model.layers.70.post_attention_layernorm.weight": "model-00027-of-00031.safetensors",
784
+ "model.layers.70.self_attn.k_norm.weight": "model-00027-of-00031.safetensors",
785
+ "model.layers.70.self_attn.k_proj.weight": "model-00027-of-00031.safetensors",
786
+ "model.layers.70.self_attn.o_proj.weight": "model-00027-of-00031.safetensors",
787
+ "model.layers.70.self_attn.q_norm.weight": "model-00027-of-00031.safetensors",
788
+ "model.layers.70.self_attn.q_proj.weight": "model-00027-of-00031.safetensors",
789
+ "model.layers.70.self_attn.v_proj.weight": "model-00027-of-00031.safetensors",
790
+ "model.layers.71.input_layernorm.weight": "model-00027-of-00031.safetensors",
791
+ "model.layers.71.mlp.down_proj.weight": "model-00027-of-00031.safetensors",
792
+ "model.layers.71.mlp.gate_proj.weight": "model-00027-of-00031.safetensors",
793
+ "model.layers.71.mlp.up_proj.weight": "model-00028-of-00031.safetensors",
794
+ "model.layers.71.post_attention_layernorm.weight": "model-00028-of-00031.safetensors",
795
+ "model.layers.71.self_attn.k_norm.weight": "model-00028-of-00031.safetensors",
796
+ "model.layers.71.self_attn.k_proj.weight": "model-00028-of-00031.safetensors",
797
+ "model.layers.71.self_attn.o_proj.weight": "model-00028-of-00031.safetensors",
798
+ "model.layers.71.self_attn.q_norm.weight": "model-00028-of-00031.safetensors",
799
+ "model.layers.71.self_attn.q_proj.weight": "model-00028-of-00031.safetensors",
800
+ "model.layers.71.self_attn.v_proj.weight": "model-00028-of-00031.safetensors",
801
+ "model.layers.72.input_layernorm.weight": "model-00028-of-00031.safetensors",
802
+ "model.layers.72.mlp.down_proj.weight": "model-00028-of-00031.safetensors",
803
+ "model.layers.72.mlp.gate_proj.weight": "model-00028-of-00031.safetensors",
804
+ "model.layers.72.mlp.up_proj.weight": "model-00028-of-00031.safetensors",
805
+ "model.layers.72.post_attention_layernorm.weight": "model-00028-of-00031.safetensors",
806
+ "model.layers.72.self_attn.k_norm.weight": "model-00028-of-00031.safetensors",
807
+ "model.layers.72.self_attn.k_proj.weight": "model-00028-of-00031.safetensors",
808
+ "model.layers.72.self_attn.o_proj.weight": "model-00028-of-00031.safetensors",
809
+ "model.layers.72.self_attn.q_norm.weight": "model-00028-of-00031.safetensors",
810
+ "model.layers.72.self_attn.q_proj.weight": "model-00028-of-00031.safetensors",
811
+ "model.layers.72.self_attn.v_proj.weight": "model-00028-of-00031.safetensors",
812
+ "model.layers.73.self_attn.o_proj.weight": "model-00028-of-00031.safetensors",
813
+ "model.layers.73.self_attn.q_norm.weight": "model-00028-of-00031.safetensors",
814
+ "model.layers.73.self_attn.q_proj.weight": "model-00028-of-00031.safetensors",
815
+ "model.layers.73.self_attn.v_proj.weight": "model-00028-of-00031.safetensors",
816
+ "model.layers.73.input_layernorm.weight": "model-00028-of-00031.safetensors",
817
+ "model.layers.73.mlp.down_proj.weight": "model-00028-of-00031.safetensors",
818
+ "model.layers.73.mlp.gate_proj.weight": "model-00028-of-00031.safetensors",
819
+ "model.layers.73.mlp.up_proj.weight": "model-00028-of-00031.safetensors",
820
+ "model.layers.73.post_attention_layernorm.weight": "model-00028-of-00031.safetensors",
821
+ "model.layers.73.self_attn.k_norm.weight": "model-00028-of-00031.safetensors",
822
+ "model.layers.73.self_attn.k_proj.weight": "model-00028-of-00031.safetensors",
823
+ "model.layers.74.input_layernorm.weight": "model-00028-of-00031.safetensors",
824
+ "model.layers.74.mlp.down_proj.weight": "model-00028-of-00031.safetensors",
825
+ "model.layers.74.mlp.gate_proj.weight": "model-00029-of-00031.safetensors",
826
+ "model.layers.74.mlp.up_proj.weight": "model-00029-of-00031.safetensors",
827
+ "model.layers.74.post_attention_layernorm.weight": "model-00029-of-00031.safetensors",
828
+ "model.layers.74.self_attn.k_norm.weight": "model-00029-of-00031.safetensors",
829
+ "model.layers.74.self_attn.k_proj.weight": "model-00029-of-00031.safetensors",
830
+ "model.layers.74.self_attn.o_proj.weight": "model-00029-of-00031.safetensors",
831
+ "model.layers.74.self_attn.q_norm.weight": "model-00029-of-00031.safetensors",
832
+ "model.layers.74.self_attn.q_proj.weight": "model-00029-of-00031.safetensors",
833
+ "model.layers.74.self_attn.v_proj.weight": "model-00029-of-00031.safetensors",
834
+ "model.layers.75.input_layernorm.weight": "model-00029-of-00031.safetensors",
835
+ "model.layers.75.mlp.down_proj.weight": "model-00029-of-00031.safetensors",
836
+ "model.layers.75.mlp.gate_proj.weight": "model-00029-of-00031.safetensors",
837
+ "model.layers.75.mlp.up_proj.weight": "model-00029-of-00031.safetensors",
838
+ "model.layers.75.post_attention_layernorm.weight": "model-00029-of-00031.safetensors",
839
+ "model.layers.75.self_attn.k_norm.weight": "model-00029-of-00031.safetensors",
840
+ "model.layers.75.self_attn.k_proj.weight": "model-00029-of-00031.safetensors",
841
+ "model.layers.75.self_attn.o_proj.weight": "model-00029-of-00031.safetensors",
842
+ "model.layers.75.self_attn.q_norm.weight": "model-00029-of-00031.safetensors",
843
+ "model.layers.75.self_attn.q_proj.weight": "model-00029-of-00031.safetensors",
844
+ "model.layers.75.self_attn.v_proj.weight": "model-00029-of-00031.safetensors",
845
+ "model.layers.76.input_layernorm.weight": "model-00029-of-00031.safetensors",
846
+ "model.layers.76.mlp.down_proj.weight": "model-00029-of-00031.safetensors",
847
+ "model.layers.76.mlp.gate_proj.weight": "model-00029-of-00031.safetensors",
848
+ "model.layers.76.mlp.up_proj.weight": "model-00029-of-00031.safetensors",
849
+ "model.layers.76.post_attention_layernorm.weight": "model-00029-of-00031.safetensors",
850
+ "model.layers.76.self_attn.k_norm.weight": "model-00029-of-00031.safetensors",
851
+ "model.layers.76.self_attn.k_proj.weight": "model-00029-of-00031.safetensors",
852
+ "model.layers.76.self_attn.o_proj.weight": "model-00029-of-00031.safetensors",
853
+ "model.layers.76.self_attn.q_norm.weight": "model-00029-of-00031.safetensors",
854
+ "model.layers.76.self_attn.q_proj.weight": "model-00029-of-00031.safetensors",
855
+ "model.layers.76.self_attn.v_proj.weight": "model-00029-of-00031.safetensors",
856
+ "model.layers.77.input_layernorm.weight": "model-00029-of-00031.safetensors",
857
+ "model.layers.77.mlp.down_proj.weight": "model-00030-of-00031.safetensors",
858
+ "model.layers.77.mlp.gate_proj.weight": "model-00030-of-00031.safetensors",
859
+ "model.layers.77.mlp.up_proj.weight": "model-00030-of-00031.safetensors",
860
+ "model.layers.77.post_attention_layernorm.weight": "model-00030-of-00031.safetensors",
861
+ "model.layers.77.self_attn.k_norm.weight": "model-00030-of-00031.safetensors",
862
+ "model.layers.77.self_attn.k_proj.weight": "model-00030-of-00031.safetensors",
863
+ "model.layers.77.self_attn.o_proj.weight": "model-00030-of-00031.safetensors",
864
+ "model.layers.77.self_attn.q_norm.weight": "model-00030-of-00031.safetensors",
865
+ "model.layers.77.self_attn.q_proj.weight": "model-00030-of-00031.safetensors",
866
+ "model.layers.77.self_attn.v_proj.weight": "model-00030-of-00031.safetensors",
867
+ "model.layers.78.input_layernorm.weight": "model-00030-of-00031.safetensors",
868
+ "model.layers.78.mlp.down_proj.weight": "model-00030-of-00031.safetensors",
869
+ "model.layers.78.mlp.gate_proj.weight": "model-00030-of-00031.safetensors",
870
+ "model.layers.78.mlp.up_proj.weight": "model-00030-of-00031.safetensors",
871
+ "model.layers.78.post_attention_layernorm.weight": "model-00030-of-00031.safetensors",
872
+ "model.layers.78.self_attn.k_norm.weight": "model-00030-of-00031.safetensors",
873
+ "model.layers.78.self_attn.k_proj.weight": "model-00030-of-00031.safetensors",
874
+ "model.layers.78.self_attn.o_proj.weight": "model-00030-of-00031.safetensors",
875
+ "model.layers.78.self_attn.q_norm.weight": "model-00030-of-00031.safetensors",
876
+ "model.layers.78.self_attn.q_proj.weight": "model-00030-of-00031.safetensors",
877
+ "model.layers.78.self_attn.v_proj.weight": "model-00030-of-00031.safetensors",
878
+ "model.layers.79.mlp.gate_proj.weight": "model-00030-of-00031.safetensors",
879
+ "model.layers.79.mlp.up_proj.weight": "model-00030-of-00031.safetensors",
880
+ "model.layers.79.post_attention_layernorm.weight": "model-00030-of-00031.safetensors",
881
+ "model.layers.79.self_attn.k_norm.weight": "model-00030-of-00031.safetensors",
882
+ "model.layers.79.self_attn.k_proj.weight": "model-00030-of-00031.safetensors",
883
+ "model.layers.79.self_attn.o_proj.weight": "model-00030-of-00031.safetensors",
884
+ "model.layers.79.self_attn.q_norm.weight": "model-00030-of-00031.safetensors",
885
+ "model.layers.79.self_attn.q_proj.weight": "model-00030-of-00031.safetensors",
886
+ "model.layers.79.self_attn.v_proj.weight": "model-00030-of-00031.safetensors",
887
+ "model.layers.79.input_layernorm.weight": "model-00030-of-00031.safetensors",
888
+ "model.layers.79.mlp.down_proj.weight": "model-00031-of-00031.safetensors"
889
+ }
890
+ }
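The weight map above follows the standard sharded-safetensors index layout: every parameter name is keyed to the shard file that stores it. As a minimal, hedged sketch (the checkpoint directory is a placeholder for a local download of this repository; the tensor name is one key taken from the listing above), a single tensor can be located and loaded like this:

# Sketch: resolve one tensor from the sharded checkpoint via the index above.
# "ckpt_dir" is an assumption: a local clone/download of this repository.
import json
import os
from safetensors.torch import load_file

ckpt_dir = "./checkpoint"  # placeholder path
with open(os.path.join(ckpt_dir, "model.safetensors.index.json")) as f:
    index = json.load(f)

name = "model.layers.49.mlp.up_proj.weight"   # listed above as stored in shard 00020
shard_file = index["weight_map"][name]
tensor = load_file(os.path.join(ckpt_dir, shard_file))[name]
print(name, tuple(tensor.shape), "->", shard_file)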
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
stage2_metadata.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "stage": "2-duplicate",
3
+ "source_model": "./Qwen3-32B-to-72B-Stage1",
4
+ "method": "Simple layer duplication",
5
+ "layer_mapping": {
6
+ "0-23": "0-23 (unchanged)",
7
+ "24-39": "24-55 (each duplicated once)",
8
+ "40-63": "56-79 (unchanged)"
9
+ },
10
+ "duplication_info": {
11
+ "method": "exact_copy",
12
+ "layers_duplicated": [
13
+ 24,
14
+ 25,
15
+ 26,
16
+ 27,
17
+ 28,
18
+ 29,
19
+ 30,
20
+ 31,
21
+ 32,
22
+ 33,
23
+ 34,
24
+ 35,
25
+ 36,
26
+ 37,
27
+ 38,
28
+ 39
29
+ ]
30
+ },
31
+ "final_layers": 80
32
+ }
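The layer_mapping recorded above can be stated as a small index function. The sketch below is illustrative only (the function name is ours, not part of the repo): old layers 0-23 keep their positions, each old layer 24-39 lands at two consecutive new positions, and old layers 40-63 shift up by 16.

# Sketch of the old-layer -> new-layer index mapping described in stage2_metadata.json.
def new_positions(old_idx: int) -> list[int]:
    if old_idx < 24:                      # 0-23 unchanged
        return [old_idx]
    if old_idx < 40:                      # 24-39 each duplicated once
        first = 24 + 2 * (old_idx - 24)
        return [first, first + 1]
    return [old_idx + 16]                 # 40-63 shifted to 56-79

assert new_positions(23) == [23]
assert new_positions(24) == [24, 25]
assert new_positions(39) == [54, 55]
assert new_positions(40) == [56]
assert new_positions(63) == [79]          # 24 + 32 + 24 = 80 layers total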
stage2_v3.py ADDED
@@ -0,0 +1,568 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Stage 2: Expand Qwen3 from 64 to 80 layers using simple duplication
4
+ Mapping:
5
+ - Layers 0-23 → 0-23 (unchanged)
6
+ - Layers 24-39 → 24-55 (each layer duplicated once)
7
+ - Layers 40-63 → 56-79 (unchanged)
8
+ """
9
+
10
+ import torch
11
+ import os
12
+ import json
13
+ from tqdm import tqdm
14
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
15
+ from safetensors.torch import load_file, save_file
16
+ import numpy as np
17
+ from collections import OrderedDict
18
+ import gc
19
+ import shutil
20
+
21
+ # Configuration
22
+ INPUT_DIR = "./Qwen3-32B-to-72B-Stage1" # Output from stage 1
23
+ OUTPUT_DIR = "./Qwen3-72B-DupeLayers"
24
+ TARGET_LAYERS = 80
25
+ SOURCE_LAYERS = 64
26
+
27
+ def load_model_sharted(model_path):
28
+ """Load model weights from sharted safetensors files."""
29
+ print("\n💩 Loading sharted weights...")
30
+
31
+ index_path = os.path.join(model_path, "model.safetensors.index.json")
32
+
33
+ if not os.path.exists(index_path):
34
+ raise FileNotFoundError(f"No index file found at {index_path}")
35
+
36
+ with open(index_path, 'r') as f:
37
+ index = json.load(f)
38
+
39
+ weight_map = index['weight_map']
40
+ unique_files = set(weight_map.values())
41
+
42
+ all_weights = {}
43
+ for file in tqdm(unique_files, desc="Loading sharts"):
44
+ file_path = os.path.join(model_path, file)
45
+ weights = load_file(file_path)
46
+ all_weights.update(weights)
47
+
48
+ return all_weights
49
+
50
+ def save_model_sharted(state_dict, output_dir, max_shart_size="5GB"):
51
+ """Save model in sharted safetensors format."""
52
+ print("\n💩 Sharting model weights...")
53
+
54
+ os.makedirs(output_dir, exist_ok=True)
55
+
56
+ # Convert max_shart_size to bytes
57
+ size_map = {'GB': 1e9, 'MB': 1e6}
58
+ for unit, multiplier in size_map.items():
59
+ if unit in max_shart_size:
60
+ max_bytes = int(float(max_shart_size.replace(unit, '')) * multiplier)
61
+ break
62
+
63
+ # Group weights into sharts
64
+ sharts = []
65
+ current_shart = {}
66
+ current_size = 0
67
+
68
+ for name, tensor in state_dict.items():
69
+ tensor_size = tensor.numel() * tensor.element_size()
70
+
71
+ if current_size + tensor_size > max_bytes and current_shart:
72
+ sharts.append(current_shart)
73
+ current_shart = {}
74
+ current_size = 0
75
+
76
+ current_shart[name] = tensor
77
+ current_size += tensor_size
78
+
79
+ if current_shart:
80
+ sharts.append(current_shart)
81
+
82
+ # Save sharts
83
+ weight_map = {}
84
+ for i, shart in enumerate(tqdm(sharts, desc="Saving sharts")):
85
+ shart_name = f"model-{i+1:05d}-of-{len(sharts):05d}.safetensors"
86
+ save_file(shart, os.path.join(output_dir, shart_name))
87
+
88
+ for name in shart:
89
+ weight_map[name] = shart_name
90
+
91
+ # Save index
92
+ index = {
93
+ "metadata": {"total_size": sum(t.numel() * t.element_size() for t in state_dict.values())},
94
+ "weight_map": weight_map
95
+ }
96
+
97
+ with open(os.path.join(output_dir, "model.safetensors.index.json"), 'w') as f:
98
+ json.dump(index, f, indent=2)
99
+
100
+ print(f"💩 Successfully sharted into {len(sharts)} files!")
101
+
102
+ def extract_layer_weights(weights, layer_idx):
103
+ """Extract all weights for a specific layer."""
104
+ layer_weights = OrderedDict()
105
+ prefix = f"model.layers.{layer_idx}."
106
+
107
+ for name, tensor in weights.items():
108
+ if name.startswith(prefix):
109
+ # Remove the layer prefix to get the component name
110
+ component_name = name[len(prefix):]
111
+ layer_weights[component_name] = tensor
112
+
113
+ return layer_weights
114
+
115
+ def create_layer_weights(layer_weights, new_layer_idx):
116
+ """Create weight dict with new layer index."""
117
+ result = OrderedDict()
118
+ prefix = f"model.layers.{new_layer_idx}."
119
+
120
+ for component_name, tensor in layer_weights.items():
121
+ full_name = prefix + component_name
122
+ result[full_name] = tensor.clone() # Clone to ensure independent copies
123
+
124
+ return result
125
+
126
+ def verify_architecture(model_path):
127
+ """Verify the model architecture matches expected Qwen3-72B dimensions."""
128
+ print("\n" + "="*60)
129
+ print("ARCHITECTURE VERIFICATION")
130
+ print("="*60)
131
+
132
+ print("\nLoading model for verification...")
133
+ model = AutoModelForCausalLM.from_pretrained(
134
+ model_path,
135
+ torch_dtype=torch.bfloat16,
136
+ device_map="cpu",
137
+ trust_remote_code=True
138
+ )
139
+
140
+ expected = {
141
+ "lm_head.weight": (151936, 8192),
142
+ "model.embed_tokens.weight": (151936, 8192),
143
+ "model.layers.0.input_layernorm.weight": (8192,),
144
+ "model.layers.0.mlp.down_proj.weight": (8192, 29568),
145
+ "model.layers.0.mlp.gate_proj.weight": (29568, 8192),
146
+ "model.layers.0.mlp.up_proj.weight": (29568, 8192),
147
+ "model.layers.0.post_attention_layernorm.weight": (8192,),
148
+ "model.layers.0.self_attn.k_norm.weight": (128,),
149
+ "model.layers.0.self_attn.k_proj.weight": (1024, 8192),
150
+ "model.layers.0.self_attn.o_proj.weight": (8192, 8192),
151
+ "model.layers.0.self_attn.q_norm.weight": (128,),
152
+ "model.layers.0.self_attn.q_proj.weight": (8192, 8192),
153
+ "model.layers.0.self_attn.v_proj.weight": (1024, 8192),
154
+ "model.norm.weight": (8192,),
155
+ }
156
+
157
+ all_correct = True
158
+
159
+ # Check specific layers including duplicated ones
160
+ check_layers = [0, 24, 25, 39, 40, 56, 79] # Original and duplicated layers
161
+
162
+ for layer_idx in check_layers:
163
+ print(f"\n📍 Checking layer {layer_idx}:")
164
+ for base_name, expected_shape in expected.items():
165
+ if "layers.0." in base_name:
166
+ name = base_name.replace("layers.0.", f"layers.{layer_idx}.")
167
+ param_dict = dict(model.named_parameters())
168
+ if name in param_dict:
169
+ actual_shape = tuple(param_dict[name].shape)
170
+ if actual_shape == expected_shape:
171
+ print(f" ✓ {name.split('.')[-1]}: {actual_shape}")
172
+ else:
173
+ print(f" ✗ {name}: {actual_shape} (expected {expected_shape})")
174
+ all_correct = False
175
+
176
+ num_layers = model.config.num_hidden_layers
177
+ print(f"\nTotal layers: {num_layers} (expected: 80)")
178
+
179
+ if all_correct and num_layers == 80:
180
+ print("\n✅ Architecture verification PASSED!")
181
+ else:
182
+ print("\n❌ Architecture verification FAILED!")
183
+
184
+ del model
185
+ torch.cuda.empty_cache()
186
+ return all_correct
187
+
188
+ def run_diagnostics(model_path):
189
+ """Run comprehensive diagnostics on the expanded model."""
190
+ print("\n" + "="*60)
191
+ print("COMPREHENSIVE DIAGNOSTICS")
192
+ print("="*60)
193
+
194
+ # Load model and tokenizer
195
+ print("\nLoading model for diagnostics...")
196
+ model = AutoModelForCausalLM.from_pretrained(
197
+ model_path,
198
+ torch_dtype=torch.bfloat16,
199
+ device_map="auto",
200
+ trust_remote_code=True
201
+ )
202
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
203
+
204
+ # Test generation quality
205
+ print("\n🧪 Generation Quality Tests:")
206
+ test_cases = [
207
+ ("The capital of France is", ["Paris"]),
208
+ ("2 + 2 =", ["4", "four"]),
209
+ ("The quick brown fox", ["jumps", "jumped", "lazy", "dog"]),
210
+ ("Hello, my name is", None),
211
+ ("Water boils at", ["100", "212", "degrees"]),
212
+ ("The Earth orbits the", ["Sun", "solar"]),
213
+ ("Machine learning is a type of", ["artificial intelligence", "AI"]),
214
+ ("Python is a", ["programming", "language", "snake"]),
215
+ ("The largest planet is", ["Jupiter"]),
216
+ ("DNA stands for", ["deoxyribonucleic", "acid"]),
217
+ # Additional tests
218
+ ("The derivative of x squared is", ["2x", "two"]),
219
+ ("Shakespeare wrote", ["plays", "Hamlet", "Romeo"]),
220
+ ("The speed of light is", ["299", "300", "fast"]),
221
+ ("Photosynthesis converts", ["light", "energy", "carbon"]),
222
+ ("The Pythagorean theorem states", ["a²", "squared", "hypotenuse"]),
223
+ ]
224
+
225
+ device = model.device
226
+ coherent_count = 0
227
+ total_tests = len(test_cases)
228
+
229
+ for prompt, expected in test_cases:
230
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
231
+
232
+ with torch.no_grad():
233
+ outputs = model.generate(
234
+ **inputs,
235
+ max_new_tokens=20,
236
+ do_sample=True,
237
+ temperature=0.7,
238
+ top_k=50,
239
+ top_p=0.95,
240
+ pad_token_id=tokenizer.pad_token_id,
241
+ )
242
+
243
+ generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
244
+ generated_only = generated_text[len(prompt):].strip()
245
+
246
+ print(f"\n Prompt: '{prompt}'")
247
+ print(f" Generated: '{generated_only}'")
248
+
249
+ # Check coherence
250
+ is_coherent = True
251
+
252
+ # Check for repetition
253
+ words = generated_only.split()
254
+ if len(words) > 3:
255
+ if len(set(words)) < len(words) / 2:
256
+ print(" ⚠️ High repetition detected")
257
+ is_coherent = False
258
+
259
+ # Check for expected content
260
+ if expected and len(generated_only) > 0:
261
+ found = any(kw.lower() in generated_only.lower() for kw in expected)
262
+ if found:
263
+ print(" ✓ Contains expected content")
264
+ else:
265
+ print(" ⚠️ Missing expected keywords")
266
+ is_coherent = False
267
+
268
+ if is_coherent and len(generated_only.split()) >= 2:
269
+ coherent_count += 1
270
+
271
+ coherence_rate = (coherent_count / total_tests) * 100
272
+ print(f"\n📊 Overall coherence rate: {coherence_rate:.1f}%")
273
+
274
+ # Perplexity test
275
+ print("\n📈 Perplexity Test:")
276
+ test_texts = [
277
+ "The quick brown fox jumps over the lazy dog.",
278
+ "In the beginning was the Word, and the Word was with God.",
279
+ "To be or not to be, that is the question.",
280
+ "E equals m c squared is Einstein's famous equation.",
281
+ ]
282
+
283
+ perplexities = []
284
+ for test_text in test_texts:
285
+ inputs = tokenizer(test_text, return_tensors="pt").to(device)
286
+
287
+ with torch.no_grad():
288
+ outputs = model(**inputs, labels=inputs["input_ids"])
289
+ perplexity = torch.exp(outputs.loss).item()
290
+ perplexities.append(perplexity)
291
+
292
+ print(f" '{test_text[:30]}...': {perplexity:.2f}")
293
+
294
+ avg_perplexity = np.mean(perplexities)
295
+ print(f"\n Average perplexity: {avg_perplexity:.2f}")
296
+
297
+ if avg_perplexity > 100:
298
+ print(" ⚠️ Very high perplexity")
299
+ elif avg_perplexity > 50:
300
+ print(" ⚠️ Moderately high perplexity")
301
+ else:
302
+ print(" ✓ Reasonable perplexity")
303
+
304
+ # Test duplicate layer behavior
305
+ print("\n🔬 Duplicate Layer Analysis:")
306
+ print("Checking if duplicated layers maintain reasonable behavior...")
307
+
308
+ # Get activations from a few layers
309
+ test_input = "The meaning of life is"
310
+ inputs = tokenizer(test_input, return_tensors="pt").to(device)
311
+
312
+ activations = {}
313
+ hooks = []
314
+
315
+ def get_activation(name):
316
+ def hook(model, input, output):
317
+ activations[name] = output[0].detach()
318
+ return hook
319
+
320
+ # Register hooks for duplicate pairs
321
+ for layer_idx in [24, 25, 39, 40]: # duplicate pair (24, 25) and two distinct neighbours (39, 40)
322
+ hook = model.model.layers[layer_idx].register_forward_hook(
323
+ get_activation(f'layer_{layer_idx}')
324
+ )
325
+ hooks.append(hook)
326
+
327
+ with torch.no_grad():
328
+ _ = model(**inputs)
329
+
330
+ # Remove hooks
331
+ for hook in hooks:
332
+ hook.remove()
333
+
334
+ # Check similarity of duplicates
335
+ if len(activations) >= 4:
336
+ # Check 24 vs 25 (should be duplicates)
337
+ act_24 = activations['layer_24'].flatten()
338
+ act_25 = activations['layer_25'].flatten()
339
+ similarity_24_25 = torch.cosine_similarity(act_24.unsqueeze(0), act_25.unsqueeze(0)).item()
340
+
341
+ # Check 39 vs 40 (should be different - new layer 39 copies original layer 31, new layer 40 copies original layer 32)
342
+ act_39 = activations['layer_39'].flatten()
343
+ act_40 = activations['layer_40'].flatten()
344
+ similarity_39_40 = torch.cosine_similarity(act_39.unsqueeze(0), act_40.unsqueeze(0)).item()
345
+
346
+ print(f" Cosine similarity layer 24 vs 25 (duplicate): {similarity_24_25:.4f}")
347
+ print(f" Cosine similarity layer 39 vs 40 (different): {similarity_39_40:.4f}")
348
+
349
+ if similarity_24_25 > 0.95:
350
+ print(" ✓ Duplicate layers show expected high similarity")
351
+ else:
352
+ print(" ⚠️ Duplicate layers diverged more than expected")
353
+
354
+ # Weight statistics check
355
+ print("\n🔍 Weight Statistics (checking for anomalies):")
356
+ anomalies = 0
357
+
358
+ for name, param in model.named_parameters():
359
+ if torch.isnan(param).any():
360
+ print(f" ⚠️ {name}: Contains NaN!")
361
+ anomalies += 1
362
+ elif torch.isinf(param).any():
363
+ print(f" ⚠️ {name}: Contains Inf!")
364
+ anomalies += 1
365
+ elif param.std() < 1e-8:
366
+ print(f" ⚠️ {name}: Zero variance!")
367
+ anomalies += 1
368
+
369
+ if anomalies == 0:
370
+ print(" ✓ No anomalies detected in weights")
371
+
372
+ # Final summary
373
+ success = coherence_rate >= 60 and avg_perplexity < 100 and anomalies == 0
374
+
375
+ print("\n" + "="*60)
376
+ print("DIAGNOSTIC SUMMARY")
377
+ print("="*60)
378
+
379
+ if success:
380
+ print("✅ Model passed all diagnostics!")
381
+ print(" - Good coherence rate")
382
+ print(" - Reasonable perplexity")
383
+ print(" - No weight anomalies")
384
+ print(" - Duplicate layers functioning correctly")
385
+ else:
386
+ print("⚠️ Some issues detected:")
387
+ if coherence_rate < 60:
388
+ print(f" - Low coherence rate: {coherence_rate:.1f}%")
389
+ if avg_perplexity >= 100:
390
+ print(f" - High average perplexity: {avg_perplexity:.2f}")
391
+ if anomalies > 0:
392
+ print(f" - Weight anomalies: {anomalies}")
393
+
394
+ del model
395
+ torch.cuda.empty_cache()
396
+ return success
397
+
398
+ def main():
399
+ print("="*60)
400
+ print("Stage 2: Simple Layer Duplication")
401
+ print("64 layers → 80 layers")
402
+ print("="*60)
403
+
404
+ # Load weights from stage 1
405
+ print(f"\n📥 Loading model from: {INPUT_DIR}")
406
+ weights = load_model_sharted(INPUT_DIR)
407
+
408
+ print(f"\n📊 Loaded {len(weights)} tensors")
409
+
410
+ # Create new weight dictionary
411
+ new_weights = OrderedDict()
412
+
413
+ # Copy non-layer weights
414
+ print("\n📋 Copying non-layer weights...")
415
+ for name, tensor in weights.items():
416
+ if not name.startswith("model.layers."):
417
+ new_weights[name] = tensor.clone()
418
+
419
+ # Layer expansion with progress bar
420
+ print("\n🔄 Expanding layers with simple duplication...")
421
+ print(" Layers 0-23: Direct copy")
422
+ print(" Layers 24-39: Each layer duplicated once")
423
+ print(" Layers 40-63: Direct copy (shifted to 56-79)")
424
+
425
+ new_layer_idx = 0
426
+
427
+ with tqdm(total=TARGET_LAYERS, desc="Creating layers") as pbar:
428
+ # Copy layers 0-23 unchanged
429
+ for old_idx in range(24):
430
+ layer_weights = extract_layer_weights(weights, old_idx)
431
+ new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
432
+ new_layer_idx += 1
433
+ pbar.update(1)
434
+
435
+ # Duplicate layers 24-39
436
+ for old_idx in range(24, 40):
437
+ # Copy original layer
438
+ layer_weights = extract_layer_weights(weights, old_idx)
439
+ new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
440
+ new_layer_idx += 1
441
+ pbar.update(1)
442
+
443
+ # Duplicate the same layer
444
+ print(f"\n Duplicating layer {old_idx} → layer {new_layer_idx}")
445
+ new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
446
+ new_layer_idx += 1
447
+ pbar.update(1)
448
+
449
+ # Copy layers 40-63 to positions 56-79
450
+ for old_idx in range(40, 64):
451
+ layer_weights = extract_layer_weights(weights, old_idx)
452
+ new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
453
+ new_layer_idx += 1
454
+ pbar.update(1)
455
+
456
+ print(f"\n✓ Created {new_layer_idx} layers")
457
+
458
+ # Verify we have all layers
459
+ if new_layer_idx != TARGET_LAYERS:
460
+ print(f"\n❌ ERROR: Created {new_layer_idx} layers but expected {TARGET_LAYERS}")
461
+ print("Layer creation failed. Exiting.")
462
+ return False
463
+
464
+ # Update config
465
+ print("\n📝 Updating model configuration...")
466
+ config_path = os.path.join(INPUT_DIR, "config.json")
467
+ with open(config_path, 'r') as f:
468
+ config = json.load(f)
469
+
470
+ config['num_hidden_layers'] = TARGET_LAYERS
471
+
472
+ # Save everything
473
+ print(f"\n💾 Saving expanded model to: {OUTPUT_DIR}")
474
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
475
+
476
+ # Save config
477
+ with open(os.path.join(OUTPUT_DIR, "config.json"), 'w') as f:
478
+ json.dump(config, f, indent=2)
479
+
480
+ # Copy tokenizer files
481
+ tokenizer_files = [
482
+ 'tokenizer.json', 'tokenizer_config.json',
483
+ 'special_tokens_map.json', 'generation_config.json'
484
+ ]
485
+
486
+ for file in tokenizer_files:
487
+ src = os.path.join(INPUT_DIR, file)
488
+ dst = os.path.join(OUTPUT_DIR, file)
489
+ if os.path.exists(src):
490
+ shutil.copy(src, dst)
491
+
492
+ # Save weights in sharted format
493
+ save_model_sharted(new_weights, OUTPUT_DIR)
494
+
495
+ # Save metadata
496
+ metadata = {
497
+ "stage": "2-duplicate",
498
+ "source_model": INPUT_DIR,
499
+ "method": "Simple layer duplication",
500
+ "layer_mapping": {
501
+ "0-23": "0-23 (unchanged)",
502
+ "24-39": "24-55 (each duplicated once)",
503
+ "40-63": "56-79 (unchanged)"
504
+ },
505
+ "duplication_info": {
506
+ "method": "exact_copy",
507
+ "layers_duplicated": list(range(24, 40))
508
+ },
509
+ "final_layers": TARGET_LAYERS
510
+ }
511
+
512
+ with open(os.path.join(OUTPUT_DIR, "stage2_metadata.json"), 'w') as f:
513
+ json.dump(metadata, f, indent=2)
514
+
515
+ print("\n✅ Stage 2 duplication complete!")
516
+
517
+ # Quick verification
518
+ print("\n🔍 Quick verification:")
519
+ print(f" Total weights: {len(new_weights)}")
520
+
521
+ # Count layers
522
+ layer_count = 0
523
+ for name in new_weights.keys():
524
+ if name.startswith("model.layers.") and ".input_layernorm.weight" in name:
525
+ layer_count += 1
526
+
527
+ print(f" Layer count: {layer_count} (expected: {TARGET_LAYERS})")
528
+
529
+ # Check duplicate similarity
530
+ print("\n🔬 Checking layer duplication:")
531
+ test_component = "self_attn.q_proj.weight"
532
+
533
+ # Check first duplicate pair
534
+ if f"model.layers.24.{test_component}" in new_weights and f"model.layers.25.{test_component}" in new_weights:
535
+ layer24 = new_weights[f"model.layers.24.{test_component}"]
536
+ layer25 = new_weights[f"model.layers.25.{test_component}"]
537
+
538
+ # Should be identical
539
+ if torch.equal(layer24, layer25):
540
+ print(" ✓ Layer 24 and 25 are identical (as expected)")
541
+ else:
542
+ print(" ⚠️ Layer 24 and 25 differ (unexpected!)")
543
+
544
+ print(f"\n🎉 SUCCESS! Model expanded to {TARGET_LAYERS} layers.")
545
+ print(f"📁 Output saved to: {OUTPUT_DIR}")
546
+
547
+ # Run full diagnostics
548
+ arch_ok = verify_architecture(OUTPUT_DIR)
549
+ diag_ok = run_diagnostics(OUTPUT_DIR)
550
+
551
+ if arch_ok and diag_ok:
552
+ print("\n🎊 FINAL SUCCESS! Your Qwen3-72B-DupeLayers model is ready and verified!")
553
+ print("\n📐 Final architecture:")
554
+ print(" Hidden size: 8192")
555
+ print(" Intermediate size: 29568")
556
+ print(" Attention heads: 64")
557
+ print(" KV heads: 8")
558
+ print(" Layers: 80")
559
+ print(" Vocabulary: 151936")
560
+ print("\n💡 The model has passed all quality checks and is ready for use!")
561
+ else:
562
+ print("\n⚠️ Some verification issues detected. Please review the diagnostics above.")
563
+
564
+ return arch_ok and diag_ok
565
+
566
+ if __name__ == "__main__":
567
+ success = main()
568
+ exit(0 if success else 1)
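Once the script above has written ./Qwen3-72B-DupeLayers, a quick smoke test mirrors its own diagnostics. A minimal sketch, assuming the output directory from the script and enough memory to hold the bfloat16 weights:

# Minimal load-and-generate check for the expanded checkpoint (path assumed from the script above).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "./Qwen3-72B-DupeLayers"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0], skip_special_tokens=True))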
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if 
enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 131072,
236
+ "pad_token": "<|endoftext|>",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
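The chat_template above implements the Qwen3 ChatML format, including an empty think block when enable_thinking is passed as false. A hedged usage sketch (the checkpoint path is a placeholder; extra keyword arguments such as enable_thinking are forwarded into the template by apply_chat_template):

# Sketch: render a prompt with the chat template defined in tokenizer_config.json.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./Qwen3-72B-DupeLayers")  # assumption: local path
messages = [{"role": "user", "content": "Summarize layer duplication in one sentence."}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # the template then emits an empty <think>...</think> block
)
print(prompt)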