Upload folder using huggingface_hub
Browse files- L10A/config.json +31 -0
- L10A/final.pth +3 -0
- L10A/metrics.json +77 -0
- L11A/config.json +31 -0
- L11A/final.pth +3 -0
- L11A/metrics.json +77 -0
- L5A/config.json +31 -0
- L5A/final.pth +3 -0
- L5A/metrics.json +77 -0
- L6A/config.json +31 -0
- L6A/final.pth +3 -0
- L6A/metrics.json +77 -0
- L7A/config.json +31 -0
- L7A/final.pth +3 -0
- L7A/metrics.json +77 -0
- L8A/config.json +31 -0
- L8A/final.pth +3 -0
- L8A/metrics.json +77 -0
- L9A/config.json +31 -0
- L9A/final.pth +3 -0
- L9A/metrics.json +77 -0
L10A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 10,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 30.592103958129883,
|
17 |
+
"out": 2.438781261444092
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L10A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:094eee9c4592e95407e9f48b1069bf6c721dc6c22979423ec9d1700099912dc1
|
3 |
+
size 1614040466
|
L10A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0004476196,
|
3 |
+
"explained_variance": 0.68912,
|
4 |
+
"l1": 13.8,
|
5 |
+
"ground_truth_norm": 2.459,
|
6 |
+
"reconstructed_norm": 2.072,
|
7 |
+
"error_norm": 1.3116,
|
8 |
+
"sparsity/below 1e-5": 21759,
|
9 |
+
"sparsity/below 1e-6": 21495,
|
10 |
+
"positivity": 0.37479,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 49,
|
13 |
+
"1": 145,
|
14 |
+
"2": 259,
|
15 |
+
"3": 316,
|
16 |
+
"4": 384,
|
17 |
+
"5": 110,
|
18 |
+
"6": 168,
|
19 |
+
"7": 8,
|
20 |
+
"8": 391,
|
21 |
+
"9": 154,
|
22 |
+
"10": 264,
|
23 |
+
"11": 40,
|
24 |
+
"12": 263,
|
25 |
+
"13": 425,
|
26 |
+
"14": 173,
|
27 |
+
"15": 123,
|
28 |
+
"16": 143,
|
29 |
+
"17": 27,
|
30 |
+
"18": 324,
|
31 |
+
"19": 91,
|
32 |
+
"20": 100,
|
33 |
+
"21": 407,
|
34 |
+
"22": 269,
|
35 |
+
"23": 327,
|
36 |
+
"24": 190,
|
37 |
+
"25": 135,
|
38 |
+
"26": 113,
|
39 |
+
"27": 233,
|
40 |
+
"28": 202,
|
41 |
+
"29": 99,
|
42 |
+
"30": 95,
|
43 |
+
"31": 95,
|
44 |
+
"32": 245,
|
45 |
+
"33": 207,
|
46 |
+
"34": 240,
|
47 |
+
"35": 176,
|
48 |
+
"36": 39,
|
49 |
+
"37": 248,
|
50 |
+
"38": 266,
|
51 |
+
"39": 166,
|
52 |
+
"40": 352,
|
53 |
+
"41": 161,
|
54 |
+
"42": 184,
|
55 |
+
"43": 30,
|
56 |
+
"44": 171,
|
57 |
+
"45": 235,
|
58 |
+
"46": 263,
|
59 |
+
"47": 293,
|
60 |
+
"48": 150,
|
61 |
+
"49": 182,
|
62 |
+
"50": 168,
|
63 |
+
"51": 78,
|
64 |
+
"52": 334,
|
65 |
+
"53": 152,
|
66 |
+
"54": 29,
|
67 |
+
"55": 268,
|
68 |
+
"56": 255,
|
69 |
+
"57": 115,
|
70 |
+
"58": 203,
|
71 |
+
"59": 83,
|
72 |
+
"60": 312,
|
73 |
+
"61": 200,
|
74 |
+
"62": 124,
|
75 |
+
"63": 230
|
76 |
+
}
|
77 |
+
}
|
L11A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 11,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 30.33755874633789,
|
17 |
+
"out": 2.5108070373535156
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L11A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60ea504a3ae409da4785d240b6ecae94556544cb11117e0c4ae8eb04757b5b16
|
3 |
+
size 1614040466
|
L11A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0004975098,
|
3 |
+
"explained_variance": 0.67261,
|
4 |
+
"l1": 14.56,
|
5 |
+
"ground_truth_norm": 2.54,
|
6 |
+
"reconstructed_norm": 2.119,
|
7 |
+
"error_norm": 1.3917,
|
8 |
+
"sparsity/below 1e-5": 23567,
|
9 |
+
"sparsity/below 1e-6": 23293,
|
10 |
+
"positivity": 0.31964,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 54,
|
13 |
+
"1": 218,
|
14 |
+
"2": 32,
|
15 |
+
"3": 116,
|
16 |
+
"4": 226,
|
17 |
+
"5": 107,
|
18 |
+
"6": 113,
|
19 |
+
"7": 66,
|
20 |
+
"8": 117,
|
21 |
+
"9": 126,
|
22 |
+
"10": 218,
|
23 |
+
"11": 220,
|
24 |
+
"12": 138,
|
25 |
+
"13": 91,
|
26 |
+
"14": 369,
|
27 |
+
"15": 360,
|
28 |
+
"16": 102,
|
29 |
+
"17": 109,
|
30 |
+
"18": 187,
|
31 |
+
"19": 299,
|
32 |
+
"20": 81,
|
33 |
+
"21": 326,
|
34 |
+
"22": 207,
|
35 |
+
"23": 195,
|
36 |
+
"24": 177,
|
37 |
+
"25": 22,
|
38 |
+
"26": 277,
|
39 |
+
"27": 41,
|
40 |
+
"28": 132,
|
41 |
+
"29": 80,
|
42 |
+
"30": 224,
|
43 |
+
"31": 370,
|
44 |
+
"32": 88,
|
45 |
+
"33": 103,
|
46 |
+
"34": 172,
|
47 |
+
"35": 109,
|
48 |
+
"36": 311,
|
49 |
+
"37": 48,
|
50 |
+
"38": 173,
|
51 |
+
"39": 43,
|
52 |
+
"40": 212,
|
53 |
+
"41": 41,
|
54 |
+
"42": 271,
|
55 |
+
"43": 69,
|
56 |
+
"44": 371,
|
57 |
+
"45": 61,
|
58 |
+
"46": 221,
|
59 |
+
"47": 152,
|
60 |
+
"48": 54,
|
61 |
+
"49": 50,
|
62 |
+
"50": 160,
|
63 |
+
"51": 178,
|
64 |
+
"52": 298,
|
65 |
+
"53": 289,
|
66 |
+
"54": 371,
|
67 |
+
"55": 44,
|
68 |
+
"56": 29,
|
69 |
+
"57": 151,
|
70 |
+
"58": 73,
|
71 |
+
"59": 101,
|
72 |
+
"60": 54,
|
73 |
+
"61": 191,
|
74 |
+
"62": 165,
|
75 |
+
"63": 421
|
76 |
+
}
|
77 |
+
}
|
L5A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 5,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 26.44734764099121,
|
17 |
+
"out": 1.6114155054092407
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L5A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:03bc0ab9a7d5b6729116001d53958d2cd20264e979fc505f9ef9b0ca997997c9
|
3 |
+
size 1614040466
|
L5A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0001057259,
|
3 |
+
"explained_variance": 0.80751,
|
4 |
+
"l1": 9.0,
|
5 |
+
"ground_truth_norm": 1.624,
|
6 |
+
"reconstructed_norm": 1.493,
|
7 |
+
"error_norm": 0.6327,
|
8 |
+
"sparsity/below 1e-5": 21411,
|
9 |
+
"sparsity/below 1e-6": 21180,
|
10 |
+
"positivity": 0.36496,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 122,
|
13 |
+
"1": 320,
|
14 |
+
"2": 352,
|
15 |
+
"3": 106,
|
16 |
+
"4": 241,
|
17 |
+
"5": 361,
|
18 |
+
"6": 204,
|
19 |
+
"7": 160,
|
20 |
+
"8": 68,
|
21 |
+
"9": 70,
|
22 |
+
"10": 225,
|
23 |
+
"11": 66,
|
24 |
+
"12": 109,
|
25 |
+
"13": 191,
|
26 |
+
"14": 337,
|
27 |
+
"15": 305,
|
28 |
+
"16": 251,
|
29 |
+
"17": 59,
|
30 |
+
"18": 287,
|
31 |
+
"19": 268,
|
32 |
+
"20": 111,
|
33 |
+
"21": 43,
|
34 |
+
"22": 97,
|
35 |
+
"23": 46,
|
36 |
+
"24": 244,
|
37 |
+
"25": 147,
|
38 |
+
"26": 47,
|
39 |
+
"27": 70,
|
40 |
+
"28": 50,
|
41 |
+
"29": 39,
|
42 |
+
"30": 285,
|
43 |
+
"31": 138,
|
44 |
+
"32": 292,
|
45 |
+
"33": 335,
|
46 |
+
"34": 353,
|
47 |
+
"35": 46,
|
48 |
+
"36": 64,
|
49 |
+
"37": 188,
|
50 |
+
"38": 26,
|
51 |
+
"39": 271,
|
52 |
+
"40": 81,
|
53 |
+
"41": 171,
|
54 |
+
"42": 358,
|
55 |
+
"43": 313,
|
56 |
+
"44": 182,
|
57 |
+
"45": 363,
|
58 |
+
"46": 316,
|
59 |
+
"47": 138,
|
60 |
+
"48": 216,
|
61 |
+
"49": 98,
|
62 |
+
"50": 177,
|
63 |
+
"51": 40,
|
64 |
+
"52": 308,
|
65 |
+
"53": 246,
|
66 |
+
"54": 152,
|
67 |
+
"55": 83,
|
68 |
+
"56": 306,
|
69 |
+
"57": 239,
|
70 |
+
"58": 44,
|
71 |
+
"59": 343,
|
72 |
+
"60": 28,
|
73 |
+
"61": 189,
|
74 |
+
"62": 297,
|
75 |
+
"63": 277
|
76 |
+
}
|
77 |
+
}
|
L6A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 6,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 25.502641677856445,
|
17 |
+
"out": 1.9176710844039917
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L6A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:621f8ec5dbb7dbd634184b176cabb3dcecf8292ac1d78036c549c45f471deca7
|
3 |
+
size 1614040466
|
L6A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0001995308,
|
3 |
+
"explained_variance": 0.75204,
|
4 |
+
"l1": 10.84,
|
5 |
+
"ground_truth_norm": 1.939,
|
6 |
+
"reconstructed_norm": 1.726,
|
7 |
+
"error_norm": 0.8747,
|
8 |
+
"sparsity/below 1e-5": 20505,
|
9 |
+
"sparsity/below 1e-6": 20331,
|
10 |
+
"positivity": 0.38525,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 181,
|
13 |
+
"1": 67,
|
14 |
+
"2": 66,
|
15 |
+
"3": 122,
|
16 |
+
"4": 144,
|
17 |
+
"5": 143,
|
18 |
+
"6": 396,
|
19 |
+
"7": 241,
|
20 |
+
"8": 379,
|
21 |
+
"9": 287,
|
22 |
+
"10": 177,
|
23 |
+
"11": 247,
|
24 |
+
"12": 112,
|
25 |
+
"13": 163,
|
26 |
+
"14": 190,
|
27 |
+
"15": 199,
|
28 |
+
"16": 252,
|
29 |
+
"17": 357,
|
30 |
+
"18": 140,
|
31 |
+
"19": 272,
|
32 |
+
"20": 202,
|
33 |
+
"21": 146,
|
34 |
+
"22": 121,
|
35 |
+
"23": 24,
|
36 |
+
"24": 301,
|
37 |
+
"25": 187,
|
38 |
+
"26": 194,
|
39 |
+
"27": 281,
|
40 |
+
"28": 40,
|
41 |
+
"29": 48,
|
42 |
+
"30": 197,
|
43 |
+
"31": 84,
|
44 |
+
"32": 379,
|
45 |
+
"33": 174,
|
46 |
+
"34": 73,
|
47 |
+
"35": 190,
|
48 |
+
"36": 225,
|
49 |
+
"37": 395,
|
50 |
+
"38": 289,
|
51 |
+
"39": 299,
|
52 |
+
"40": 339,
|
53 |
+
"41": 112,
|
54 |
+
"42": 67,
|
55 |
+
"43": 350,
|
56 |
+
"44": 219,
|
57 |
+
"45": 98,
|
58 |
+
"46": 202,
|
59 |
+
"47": 149,
|
60 |
+
"48": 176,
|
61 |
+
"49": 324,
|
62 |
+
"50": 330,
|
63 |
+
"51": 338,
|
64 |
+
"52": 301,
|
65 |
+
"53": 41,
|
66 |
+
"54": 121,
|
67 |
+
"55": 205,
|
68 |
+
"56": 53,
|
69 |
+
"57": 190,
|
70 |
+
"58": 115,
|
71 |
+
"59": 195,
|
72 |
+
"60": 142,
|
73 |
+
"61": 86,
|
74 |
+
"62": 257,
|
75 |
+
"63": 230
|
76 |
+
}
|
77 |
+
}
|
L7A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 7,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 25.808958053588867,
|
17 |
+
"out": 2.3595995903015137
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L7A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6de3fcf7114df285a40bf94da6ccba859b968950ec4872e641c53322adc8c50b
|
3 |
+
size 1614040466
|
L7A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0003076273,
|
3 |
+
"explained_variance": 0.75009,
|
4 |
+
"l1": 13.16,
|
5 |
+
"ground_truth_norm": 2.385,
|
6 |
+
"reconstructed_norm": 2.113,
|
7 |
+
"error_norm": 1.0933,
|
8 |
+
"sparsity/below 1e-5": 21840,
|
9 |
+
"sparsity/below 1e-6": 21729,
|
10 |
+
"positivity": 0.34482,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 52,
|
13 |
+
"1": 259,
|
14 |
+
"2": 94,
|
15 |
+
"3": 135,
|
16 |
+
"4": 109,
|
17 |
+
"5": 135,
|
18 |
+
"6": 198,
|
19 |
+
"7": 257,
|
20 |
+
"8": 121,
|
21 |
+
"9": 185,
|
22 |
+
"10": 377,
|
23 |
+
"11": 57,
|
24 |
+
"12": 218,
|
25 |
+
"13": 168,
|
26 |
+
"14": 228,
|
27 |
+
"15": 128,
|
28 |
+
"16": 216,
|
29 |
+
"17": 98,
|
30 |
+
"18": 232,
|
31 |
+
"19": 132,
|
32 |
+
"20": 324,
|
33 |
+
"21": 206,
|
34 |
+
"22": 194,
|
35 |
+
"23": 66,
|
36 |
+
"24": 275,
|
37 |
+
"25": 158,
|
38 |
+
"26": 160,
|
39 |
+
"27": 238,
|
40 |
+
"28": 238,
|
41 |
+
"29": 60,
|
42 |
+
"30": 167,
|
43 |
+
"31": 215,
|
44 |
+
"32": 291,
|
45 |
+
"33": 337,
|
46 |
+
"34": 124,
|
47 |
+
"35": 84,
|
48 |
+
"36": 140,
|
49 |
+
"37": 230,
|
50 |
+
"38": 100,
|
51 |
+
"39": 140,
|
52 |
+
"40": 262,
|
53 |
+
"41": 368,
|
54 |
+
"42": 92,
|
55 |
+
"43": 87,
|
56 |
+
"44": 287,
|
57 |
+
"45": 202,
|
58 |
+
"46": 182,
|
59 |
+
"47": 94,
|
60 |
+
"48": 362,
|
61 |
+
"49": 143,
|
62 |
+
"50": 156,
|
63 |
+
"51": 44,
|
64 |
+
"52": 97,
|
65 |
+
"53": 137,
|
66 |
+
"54": 168,
|
67 |
+
"55": 110,
|
68 |
+
"56": 75,
|
69 |
+
"57": 151,
|
70 |
+
"58": 41,
|
71 |
+
"59": 271,
|
72 |
+
"60": 160,
|
73 |
+
"61": 208,
|
74 |
+
"62": 112,
|
75 |
+
"63": 344
|
76 |
+
}
|
77 |
+
}
|
L8A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 8,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 28.46880340576172,
|
17 |
+
"out": 2.3348097801208496
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L8A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:68d209ff696f96aea135ee110e147f143d4ceadb21040d1733f90cdb23fc13ab
|
3 |
+
size 1614040466
|
L8A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0003913084,
|
3 |
+
"explained_variance": 0.69221,
|
4 |
+
"l1": 13.56,
|
5 |
+
"ground_truth_norm": 2.347,
|
6 |
+
"reconstructed_norm": 1.988,
|
7 |
+
"error_norm": 1.2371,
|
8 |
+
"sparsity/below 1e-5": 23158,
|
9 |
+
"sparsity/below 1e-6": 23050,
|
10 |
+
"positivity": 0.30444,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 284,
|
13 |
+
"1": 136,
|
14 |
+
"2": 368,
|
15 |
+
"3": 27,
|
16 |
+
"4": 14,
|
17 |
+
"5": 211,
|
18 |
+
"6": 358,
|
19 |
+
"7": 97,
|
20 |
+
"8": 10,
|
21 |
+
"9": 250,
|
22 |
+
"10": 448,
|
23 |
+
"11": 145,
|
24 |
+
"12": 4,
|
25 |
+
"13": 21,
|
26 |
+
"14": 104,
|
27 |
+
"15": 44,
|
28 |
+
"16": 43,
|
29 |
+
"17": 104,
|
30 |
+
"18": 304,
|
31 |
+
"19": 207,
|
32 |
+
"20": 74,
|
33 |
+
"21": 51,
|
34 |
+
"22": 162,
|
35 |
+
"23": 33,
|
36 |
+
"24": 107,
|
37 |
+
"25": 52,
|
38 |
+
"26": 322,
|
39 |
+
"27": 148,
|
40 |
+
"28": 116,
|
41 |
+
"29": 248,
|
42 |
+
"30": 323,
|
43 |
+
"31": 203,
|
44 |
+
"32": 220,
|
45 |
+
"33": 146,
|
46 |
+
"34": 273,
|
47 |
+
"35": 357,
|
48 |
+
"36": 17,
|
49 |
+
"37": 312,
|
50 |
+
"38": 24,
|
51 |
+
"39": 104,
|
52 |
+
"40": 126,
|
53 |
+
"41": 11,
|
54 |
+
"42": 53,
|
55 |
+
"43": 27,
|
56 |
+
"44": 42,
|
57 |
+
"45": 214,
|
58 |
+
"46": 240,
|
59 |
+
"47": 110,
|
60 |
+
"48": 226,
|
61 |
+
"49": 156,
|
62 |
+
"50": 256,
|
63 |
+
"51": 234,
|
64 |
+
"52": 269,
|
65 |
+
"53": 40,
|
66 |
+
"54": 113,
|
67 |
+
"55": 41,
|
68 |
+
"56": 52,
|
69 |
+
"57": 112,
|
70 |
+
"58": 112,
|
71 |
+
"59": 319,
|
72 |
+
"60": 197,
|
73 |
+
"61": 299,
|
74 |
+
"62": 156,
|
75 |
+
"63": 100
|
76 |
+
}
|
77 |
+
}
|
L9A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 9,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 28.447616577148438,
|
17 |
+
"out": 2.536945104598999
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L9A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6780ce060cdb2d3c2b35e90009c10fcf451d009b0f2cb7cf98dae5dbf92cfedc
|
3 |
+
size 1614040466
|
L9A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0004761537,
|
3 |
+
"explained_variance": 0.69271,
|
4 |
+
"l1": 14.97,
|
5 |
+
"ground_truth_norm": 2.561,
|
6 |
+
"reconstructed_norm": 2.163,
|
7 |
+
"error_norm": 1.3627,
|
8 |
+
"sparsity/below 1e-5": 23308,
|
9 |
+
"sparsity/below 1e-6": 23088,
|
10 |
+
"positivity": 0.31,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 44,
|
13 |
+
"1": 102,
|
14 |
+
"2": 224,
|
15 |
+
"3": 101,
|
16 |
+
"4": 4,
|
17 |
+
"5": 348,
|
18 |
+
"6": 180,
|
19 |
+
"7": 21,
|
20 |
+
"8": 94,
|
21 |
+
"9": 57,
|
22 |
+
"10": 8,
|
23 |
+
"11": 295,
|
24 |
+
"12": 114,
|
25 |
+
"13": 187,
|
26 |
+
"14": 408,
|
27 |
+
"15": 79,
|
28 |
+
"16": 453,
|
29 |
+
"17": 95,
|
30 |
+
"18": 17,
|
31 |
+
"19": 198,
|
32 |
+
"20": 178,
|
33 |
+
"21": 115,
|
34 |
+
"22": 20,
|
35 |
+
"23": 73,
|
36 |
+
"24": 154,
|
37 |
+
"25": 192,
|
38 |
+
"26": 83,
|
39 |
+
"27": 174,
|
40 |
+
"28": 148,
|
41 |
+
"29": 214,
|
42 |
+
"30": 159,
|
43 |
+
"31": 222,
|
44 |
+
"32": 48,
|
45 |
+
"33": 193,
|
46 |
+
"34": 231,
|
47 |
+
"35": 418,
|
48 |
+
"36": 48,
|
49 |
+
"37": 51,
|
50 |
+
"38": 300,
|
51 |
+
"39": 426,
|
52 |
+
"40": 125,
|
53 |
+
"41": 193,
|
54 |
+
"42": 133,
|
55 |
+
"43": 252,
|
56 |
+
"44": 114,
|
57 |
+
"45": 198,
|
58 |
+
"46": 207,
|
59 |
+
"47": 302,
|
60 |
+
"48": 144,
|
61 |
+
"49": 38,
|
62 |
+
"50": 36,
|
63 |
+
"51": 87,
|
64 |
+
"52": 394,
|
65 |
+
"53": 121,
|
66 |
+
"54": 322,
|
67 |
+
"55": 14,
|
68 |
+
"56": 3,
|
69 |
+
"57": 147,
|
70 |
+
"58": 299,
|
71 |
+
"59": 110,
|
72 |
+
"60": 176,
|
73 |
+
"61": 30,
|
74 |
+
"62": 10,
|
75 |
+
"63": 227
|
76 |
+
}
|
77 |
+
}
|