Upload folder using huggingface_hub
Browse files- L12A/config.json +31 -0
- L12A/final.pth +3 -0
- L12A/metrics.json +77 -0
- L13A/config.json +31 -0
- L13A/final.pth +3 -0
- L13A/metrics.json +77 -0
- L14A/config.json +31 -0
- L14A/final.pth +3 -0
- L14A/metrics.json +77 -0
- L15A/config.json +31 -0
- L15A/final.pth +3 -0
- L15A/metrics.json +77 -0
- L16A/config.json +31 -0
- L16A/final.pth +3 -0
- L16A/metrics.json +77 -0
- L17A/config.json +31 -0
- L17A/final.pth +3 -0
- L17A/metrics.json +77 -0
- L18A/config.json +31 -0
- L18A/final.pth +3 -0
- L18A/metrics.json +77 -0
- L19A/config.json +31 -0
- L19A/final.pth +3 -0
- L19A/metrics.json +77 -0
- L20A/config.json +31 -0
- L20A/final.pth +3 -0
- L20A/metrics.json +77 -0
- L21A/config.json +31 -0
- L21A/final.pth +3 -0
- L21A/metrics.json +77 -0
- L22A/config.json +31 -0
- L22A/final.pth +3 -0
- L22A/metrics.json +77 -0
- L23A/config.json +31 -0
- L23A/final.pth +3 -0
- L23A/metrics.json +77 -0
L12A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 12,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 27.36377716064453,
|
17 |
+
"out": 2.912698745727539
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L12A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e94c9348a6c6e4382ce1f141a3be0fead61c0c81f073144900e0bac8692570be
|
3 |
+
size 1614040466
|
L12A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0006444874,
|
3 |
+
"explained_variance": 0.65362,
|
4 |
+
"l1": 16.72,
|
5 |
+
"ground_truth_norm": 2.933,
|
6 |
+
"reconstructed_norm": 2.46,
|
7 |
+
"error_norm": 1.5802,
|
8 |
+
"sparsity/below 1e-5": 24648,
|
9 |
+
"sparsity/below 1e-6": 24548,
|
10 |
+
"positivity": 0.2558,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 113,
|
13 |
+
"1": 3,
|
14 |
+
"2": 28,
|
15 |
+
"3": 61,
|
16 |
+
"4": 361,
|
17 |
+
"5": 79,
|
18 |
+
"6": 180,
|
19 |
+
"7": 20,
|
20 |
+
"8": 215,
|
21 |
+
"9": 12,
|
22 |
+
"10": 97,
|
23 |
+
"11": 136,
|
24 |
+
"12": 146,
|
25 |
+
"13": 89,
|
26 |
+
"14": 13,
|
27 |
+
"15": 30,
|
28 |
+
"16": 5,
|
29 |
+
"17": 20,
|
30 |
+
"18": 183,
|
31 |
+
"19": 7,
|
32 |
+
"20": 191,
|
33 |
+
"21": 216,
|
34 |
+
"22": 2,
|
35 |
+
"23": 21,
|
36 |
+
"24": 166,
|
37 |
+
"25": 104,
|
38 |
+
"26": 334,
|
39 |
+
"27": 418,
|
40 |
+
"28": 368,
|
41 |
+
"29": 35,
|
42 |
+
"30": 306,
|
43 |
+
"31": 383,
|
44 |
+
"32": 166,
|
45 |
+
"33": 394,
|
46 |
+
"34": 129,
|
47 |
+
"35": 64,
|
48 |
+
"36": 418,
|
49 |
+
"37": 29,
|
50 |
+
"38": 10,
|
51 |
+
"39": 130,
|
52 |
+
"40": 122,
|
53 |
+
"41": 175,
|
54 |
+
"42": 8,
|
55 |
+
"43": 124,
|
56 |
+
"44": 15,
|
57 |
+
"45": 20,
|
58 |
+
"46": 13,
|
59 |
+
"47": 198,
|
60 |
+
"48": 308,
|
61 |
+
"49": 141,
|
62 |
+
"50": 27,
|
63 |
+
"51": 166,
|
64 |
+
"52": 85,
|
65 |
+
"53": 8,
|
66 |
+
"54": 247,
|
67 |
+
"55": 142,
|
68 |
+
"56": 31,
|
69 |
+
"57": 18,
|
70 |
+
"58": 47,
|
71 |
+
"59": 80,
|
72 |
+
"60": 277,
|
73 |
+
"61": 73,
|
74 |
+
"62": 157,
|
75 |
+
"63": 218
|
76 |
+
}
|
77 |
+
}
|
L13A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 13,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 31.526508331298828,
|
17 |
+
"out": 3.1382012367248535
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L13A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c8bf691e58b635c9c5a20488926004b6c872f6a4a52693fb278b2f00f5f96a80
|
3 |
+
size 1614040466
|
L13A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0007835636,
|
3 |
+
"explained_variance": 0.68137,
|
4 |
+
"l1": 18.08,
|
5 |
+
"ground_truth_norm": 3.16,
|
6 |
+
"reconstructed_norm": 2.63,
|
7 |
+
"error_norm": 1.7362,
|
8 |
+
"sparsity/below 1e-5": 25078,
|
9 |
+
"sparsity/below 1e-6": 25001,
|
10 |
+
"positivity": 0.24667,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 180,
|
13 |
+
"1": 68,
|
14 |
+
"2": 279,
|
15 |
+
"3": 133,
|
16 |
+
"4": 12,
|
17 |
+
"5": 123,
|
18 |
+
"6": 120,
|
19 |
+
"7": 55,
|
20 |
+
"8": 189,
|
21 |
+
"9": 117,
|
22 |
+
"10": 410,
|
23 |
+
"11": 41,
|
24 |
+
"12": 171,
|
25 |
+
"13": 12,
|
26 |
+
"14": 35,
|
27 |
+
"15": 255,
|
28 |
+
"16": 40,
|
29 |
+
"17": 3,
|
30 |
+
"18": 9,
|
31 |
+
"19": 38,
|
32 |
+
"20": 207,
|
33 |
+
"21": 354,
|
34 |
+
"22": 223,
|
35 |
+
"23": 450,
|
36 |
+
"24": 27,
|
37 |
+
"25": 31,
|
38 |
+
"26": 65,
|
39 |
+
"27": 21,
|
40 |
+
"28": 217,
|
41 |
+
"29": 63,
|
42 |
+
"30": 278,
|
43 |
+
"31": 25,
|
44 |
+
"32": 197,
|
45 |
+
"33": 368,
|
46 |
+
"34": 101,
|
47 |
+
"35": 64,
|
48 |
+
"36": 310,
|
49 |
+
"37": 32,
|
50 |
+
"38": 174,
|
51 |
+
"39": 286,
|
52 |
+
"40": 109,
|
53 |
+
"41": 101,
|
54 |
+
"42": 7,
|
55 |
+
"43": 147,
|
56 |
+
"44": 26,
|
57 |
+
"45": 38,
|
58 |
+
"46": 55,
|
59 |
+
"47": 7,
|
60 |
+
"48": 351,
|
61 |
+
"49": 93,
|
62 |
+
"50": 10,
|
63 |
+
"51": 14,
|
64 |
+
"52": 3,
|
65 |
+
"53": 179,
|
66 |
+
"54": 129,
|
67 |
+
"55": 182,
|
68 |
+
"56": 23,
|
69 |
+
"57": 65,
|
70 |
+
"58": 37,
|
71 |
+
"59": 180,
|
72 |
+
"60": 82,
|
73 |
+
"61": 117,
|
74 |
+
"62": 37,
|
75 |
+
"63": 308
|
76 |
+
}
|
77 |
+
}
|
L14A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 14,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 32.4897575378418,
|
17 |
+
"out": 3.191192865371704
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L14A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57feb9549ec962084f9978ff99b2c3ec6ec4c5271e6bd270a00280a5442ff483
|
3 |
+
size 1614040466
|
L14A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0007458544,
|
3 |
+
"explained_variance": 0.71227,
|
4 |
+
"l1": 20.76,
|
5 |
+
"ground_truth_norm": 3.223,
|
6 |
+
"reconstructed_norm": 2.733,
|
7 |
+
"error_norm": 1.6916,
|
8 |
+
"sparsity/below 1e-5": 26050,
|
9 |
+
"sparsity/below 1e-6": 26003,
|
10 |
+
"positivity": 0.20981,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 17,
|
13 |
+
"1": 107,
|
14 |
+
"2": 53,
|
15 |
+
"3": 146,
|
16 |
+
"4": 3,
|
17 |
+
"5": 114,
|
18 |
+
"6": 19,
|
19 |
+
"7": 20,
|
20 |
+
"8": 92,
|
21 |
+
"9": 123,
|
22 |
+
"10": 178,
|
23 |
+
"11": 182,
|
24 |
+
"12": 230,
|
25 |
+
"13": 181,
|
26 |
+
"14": 45,
|
27 |
+
"15": 64,
|
28 |
+
"16": 310,
|
29 |
+
"17": 9,
|
30 |
+
"18": 15,
|
31 |
+
"19": 27,
|
32 |
+
"20": 154,
|
33 |
+
"21": 94,
|
34 |
+
"22": 117,
|
35 |
+
"23": 164,
|
36 |
+
"24": 189,
|
37 |
+
"25": 64,
|
38 |
+
"26": 190,
|
39 |
+
"27": 90,
|
40 |
+
"28": 191,
|
41 |
+
"29": 178,
|
42 |
+
"30": 47,
|
43 |
+
"31": 151,
|
44 |
+
"32": 5,
|
45 |
+
"33": 264,
|
46 |
+
"34": 82,
|
47 |
+
"35": 121,
|
48 |
+
"36": 86,
|
49 |
+
"37": 17,
|
50 |
+
"38": 55,
|
51 |
+
"39": 8,
|
52 |
+
"40": 410,
|
53 |
+
"41": 107,
|
54 |
+
"42": 51,
|
55 |
+
"43": 44,
|
56 |
+
"44": 233,
|
57 |
+
"45": 26,
|
58 |
+
"46": 98,
|
59 |
+
"47": 34,
|
60 |
+
"48": 95,
|
61 |
+
"49": 99,
|
62 |
+
"50": 162,
|
63 |
+
"51": 150,
|
64 |
+
"52": 298,
|
65 |
+
"53": 10,
|
66 |
+
"54": 85,
|
67 |
+
"55": 20,
|
68 |
+
"56": 69,
|
69 |
+
"57": 235,
|
70 |
+
"58": 38,
|
71 |
+
"59": 187,
|
72 |
+
"60": 18,
|
73 |
+
"61": 107,
|
74 |
+
"62": 66,
|
75 |
+
"63": 31
|
76 |
+
}
|
77 |
+
}
|
L15A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 15,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 30.77876853942871,
|
17 |
+
"out": 3.105498790740967
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L15A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82ba1571ed03bc6e7ae938b992bda3eba7a6bf0f93bf2502ff77c7d9ad326e11
|
3 |
+
size 1614040466
|
L15A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0007111974,
|
3 |
+
"explained_variance": 0.70865,
|
4 |
+
"l1": 18.94,
|
5 |
+
"ground_truth_norm": 3.159,
|
6 |
+
"reconstructed_norm": 2.682,
|
7 |
+
"error_norm": 1.6552,
|
8 |
+
"sparsity/below 1e-5": 22472,
|
9 |
+
"sparsity/below 1e-6": 22379,
|
10 |
+
"positivity": 0.32211,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 302,
|
13 |
+
"1": 81,
|
14 |
+
"2": 361,
|
15 |
+
"3": 286,
|
16 |
+
"4": 184,
|
17 |
+
"5": 178,
|
18 |
+
"6": 258,
|
19 |
+
"7": 84,
|
20 |
+
"8": 88,
|
21 |
+
"9": 419,
|
22 |
+
"10": 113,
|
23 |
+
"11": 161,
|
24 |
+
"12": 365,
|
25 |
+
"13": 175,
|
26 |
+
"14": 327,
|
27 |
+
"15": 154,
|
28 |
+
"16": 355,
|
29 |
+
"17": 289,
|
30 |
+
"18": 244,
|
31 |
+
"19": 50,
|
32 |
+
"20": 93,
|
33 |
+
"21": 39,
|
34 |
+
"22": 144,
|
35 |
+
"23": 16,
|
36 |
+
"24": 194,
|
37 |
+
"25": 101,
|
38 |
+
"26": 122,
|
39 |
+
"27": 43,
|
40 |
+
"28": 35,
|
41 |
+
"29": 59,
|
42 |
+
"30": 123,
|
43 |
+
"31": 292,
|
44 |
+
"32": 281,
|
45 |
+
"33": 291,
|
46 |
+
"34": 107,
|
47 |
+
"35": 380,
|
48 |
+
"36": 284,
|
49 |
+
"37": 74,
|
50 |
+
"38": 108,
|
51 |
+
"39": 12,
|
52 |
+
"40": 212,
|
53 |
+
"41": 272,
|
54 |
+
"42": 52,
|
55 |
+
"43": 200,
|
56 |
+
"44": 71,
|
57 |
+
"45": 98,
|
58 |
+
"46": 72,
|
59 |
+
"47": 2,
|
60 |
+
"48": 198,
|
61 |
+
"49": 86,
|
62 |
+
"50": 208,
|
63 |
+
"51": 69,
|
64 |
+
"52": 145,
|
65 |
+
"53": 141,
|
66 |
+
"54": 10,
|
67 |
+
"55": 149,
|
68 |
+
"56": 82,
|
69 |
+
"57": 269,
|
70 |
+
"58": 126,
|
71 |
+
"59": 26,
|
72 |
+
"60": 339,
|
73 |
+
"61": 151,
|
74 |
+
"62": 121,
|
75 |
+
"63": 184
|
76 |
+
}
|
77 |
+
}
|
L16A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 16,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 31.657766342163086,
|
17 |
+
"out": 3.292898416519165
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L16A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9ca2047752006f4fed44dc1272e1f8b9a4e12e6750cc1b2efae923ab53ad897e
|
3 |
+
size 1614040466
|
L16A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0006912273,
|
3 |
+
"explained_variance": 0.73999,
|
4 |
+
"l1": 20.49,
|
5 |
+
"ground_truth_norm": 3.339,
|
6 |
+
"reconstructed_norm": 2.904,
|
7 |
+
"error_norm": 1.6263,
|
8 |
+
"sparsity/below 1e-5": 22885,
|
9 |
+
"sparsity/below 1e-6": 22814,
|
10 |
+
"positivity": 0.3063,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 216,
|
13 |
+
"1": 304,
|
14 |
+
"2": 7,
|
15 |
+
"3": 287,
|
16 |
+
"4": 379,
|
17 |
+
"5": 244,
|
18 |
+
"6": 95,
|
19 |
+
"7": 110,
|
20 |
+
"8": 121,
|
21 |
+
"9": 75,
|
22 |
+
"10": 3,
|
23 |
+
"11": 290,
|
24 |
+
"12": 72,
|
25 |
+
"13": 57,
|
26 |
+
"14": 104,
|
27 |
+
"15": 140,
|
28 |
+
"16": 29,
|
29 |
+
"17": 125,
|
30 |
+
"18": 257,
|
31 |
+
"19": 305,
|
32 |
+
"20": 94,
|
33 |
+
"21": 296,
|
34 |
+
"22": 154,
|
35 |
+
"23": 92,
|
36 |
+
"24": 179,
|
37 |
+
"25": 278,
|
38 |
+
"26": 70,
|
39 |
+
"27": 99,
|
40 |
+
"28": 176,
|
41 |
+
"29": 146,
|
42 |
+
"30": 56,
|
43 |
+
"31": 57,
|
44 |
+
"32": 6,
|
45 |
+
"33": 269,
|
46 |
+
"34": 194,
|
47 |
+
"35": 172,
|
48 |
+
"36": 170,
|
49 |
+
"37": 150,
|
50 |
+
"38": 3,
|
51 |
+
"39": 46,
|
52 |
+
"40": 83,
|
53 |
+
"41": 77,
|
54 |
+
"42": 107,
|
55 |
+
"43": 11,
|
56 |
+
"44": 2,
|
57 |
+
"45": 208,
|
58 |
+
"46": 352,
|
59 |
+
"47": 16,
|
60 |
+
"48": 364,
|
61 |
+
"49": 88,
|
62 |
+
"50": 247,
|
63 |
+
"51": 54,
|
64 |
+
"52": 141,
|
65 |
+
"53": 182,
|
66 |
+
"54": 358,
|
67 |
+
"55": 305,
|
68 |
+
"56": 38,
|
69 |
+
"57": 140,
|
70 |
+
"58": 186,
|
71 |
+
"59": 71,
|
72 |
+
"60": 319,
|
73 |
+
"61": 375,
|
74 |
+
"62": 90,
|
75 |
+
"63": 296
|
76 |
+
}
|
77 |
+
}
|
L17A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 17,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 31.074840545654297,
|
17 |
+
"out": 3.057485818862915
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L17A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60c31a6ebb759b91b11aa70f4b80cb7b3135f745783599134d3df77b72b64017
|
3 |
+
size 1614040466
|
L17A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0005549974,
|
3 |
+
"explained_variance": 0.75869,
|
4 |
+
"l1": 19.07,
|
5 |
+
"ground_truth_norm": 3.08,
|
6 |
+
"reconstructed_norm": 2.707,
|
7 |
+
"error_norm": 1.4556,
|
8 |
+
"sparsity/below 1e-5": 20583,
|
9 |
+
"sparsity/below 1e-6": 20264,
|
10 |
+
"positivity": 0.39352,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 232,
|
13 |
+
"1": 170,
|
14 |
+
"2": 219,
|
15 |
+
"3": 309,
|
16 |
+
"4": 196,
|
17 |
+
"5": 345,
|
18 |
+
"6": 351,
|
19 |
+
"7": 127,
|
20 |
+
"8": 174,
|
21 |
+
"9": 63,
|
22 |
+
"10": 150,
|
23 |
+
"11": 287,
|
24 |
+
"12": 383,
|
25 |
+
"13": 268,
|
26 |
+
"14": 79,
|
27 |
+
"15": 64,
|
28 |
+
"16": 21,
|
29 |
+
"17": 94,
|
30 |
+
"18": 156,
|
31 |
+
"19": 148,
|
32 |
+
"20": 256,
|
33 |
+
"21": 136,
|
34 |
+
"22": 282,
|
35 |
+
"23": 73,
|
36 |
+
"24": 373,
|
37 |
+
"25": 354,
|
38 |
+
"26": 265,
|
39 |
+
"27": 85,
|
40 |
+
"28": 44,
|
41 |
+
"29": 237,
|
42 |
+
"30": 49,
|
43 |
+
"31": 71,
|
44 |
+
"32": 374,
|
45 |
+
"33": 249,
|
46 |
+
"34": 215,
|
47 |
+
"35": 44,
|
48 |
+
"36": 337,
|
49 |
+
"37": 118,
|
50 |
+
"38": 387,
|
51 |
+
"39": 306,
|
52 |
+
"40": 119,
|
53 |
+
"41": 361,
|
54 |
+
"42": 361,
|
55 |
+
"43": 285,
|
56 |
+
"44": 278,
|
57 |
+
"45": 130,
|
58 |
+
"46": 318,
|
59 |
+
"47": 135,
|
60 |
+
"48": 391,
|
61 |
+
"49": 172,
|
62 |
+
"50": 268,
|
63 |
+
"51": 61,
|
64 |
+
"52": 232,
|
65 |
+
"53": 88,
|
66 |
+
"54": 363,
|
67 |
+
"55": 79,
|
68 |
+
"56": 352,
|
69 |
+
"57": 57,
|
70 |
+
"58": 205,
|
71 |
+
"59": 115,
|
72 |
+
"60": 51,
|
73 |
+
"61": 70,
|
74 |
+
"62": 197,
|
75 |
+
"63": 116
|
76 |
+
}
|
77 |
+
}
|
L18A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 18,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 31.758373260498047,
|
17 |
+
"out": 2.5856893062591553
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L18A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a5dc42a30ce6e391e61b142aafba75c0aa18895756149da495f7243341430f04
|
3 |
+
size 1614040466
|
L18A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0003417022,
|
3 |
+
"explained_variance": 0.78765,
|
4 |
+
"l1": 16.81,
|
5 |
+
"ground_truth_norm": 2.625,
|
6 |
+
"reconstructed_norm": 2.361,
|
7 |
+
"error_norm": 1.1308,
|
8 |
+
"sparsity/below 1e-5": 18686,
|
9 |
+
"sparsity/below 1e-6": 17927,
|
10 |
+
"positivity": 0.48187,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 460,
|
13 |
+
"1": 349,
|
14 |
+
"2": 470,
|
15 |
+
"3": 253,
|
16 |
+
"4": 337,
|
17 |
+
"5": 343,
|
18 |
+
"6": 99,
|
19 |
+
"7": 414,
|
20 |
+
"8": 139,
|
21 |
+
"9": 172,
|
22 |
+
"10": 185,
|
23 |
+
"11": 43,
|
24 |
+
"12": 418,
|
25 |
+
"13": 387,
|
26 |
+
"14": 265,
|
27 |
+
"15": 419,
|
28 |
+
"16": 170,
|
29 |
+
"17": 48,
|
30 |
+
"18": 173,
|
31 |
+
"19": 255,
|
32 |
+
"20": 297,
|
33 |
+
"21": 196,
|
34 |
+
"22": 118,
|
35 |
+
"23": 145,
|
36 |
+
"24": 143,
|
37 |
+
"25": 402,
|
38 |
+
"26": 151,
|
39 |
+
"27": 451,
|
40 |
+
"28": 76,
|
41 |
+
"29": 345,
|
42 |
+
"30": 254,
|
43 |
+
"31": 208,
|
44 |
+
"32": 213,
|
45 |
+
"33": 164,
|
46 |
+
"34": 299,
|
47 |
+
"35": 163,
|
48 |
+
"36": 309,
|
49 |
+
"37": 311,
|
50 |
+
"38": 197,
|
51 |
+
"39": 227,
|
52 |
+
"40": 221,
|
53 |
+
"41": 315,
|
54 |
+
"42": 127,
|
55 |
+
"43": 389,
|
56 |
+
"44": 272,
|
57 |
+
"45": 160,
|
58 |
+
"46": 74,
|
59 |
+
"47": 54,
|
60 |
+
"48": 423,
|
61 |
+
"49": 127,
|
62 |
+
"50": 317,
|
63 |
+
"51": 59,
|
64 |
+
"52": 395,
|
65 |
+
"53": 261,
|
66 |
+
"54": 406,
|
67 |
+
"55": 306,
|
68 |
+
"56": 165,
|
69 |
+
"57": 276,
|
70 |
+
"58": 96,
|
71 |
+
"59": 347,
|
72 |
+
"60": 170,
|
73 |
+
"61": 339,
|
74 |
+
"62": 394,
|
75 |
+
"63": 29
|
76 |
+
}
|
77 |
+
}
|
L19A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 19,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 31.919294357299805,
|
17 |
+
"out": 2.284219980239868
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L19A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b1c138b05a313c4085b599ba89e1fd445b66f23ff0ec12023139180cecb7a3ce
|
3 |
+
size 1614040466
|
L19A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0003078198,
|
3 |
+
"explained_variance": 0.76338,
|
4 |
+
"l1": 15.76,
|
5 |
+
"ground_truth_norm": 2.332,
|
6 |
+
"reconstructed_norm": 2.067,
|
7 |
+
"error_norm": 1.0594,
|
8 |
+
"sparsity/below 1e-5": 16110,
|
9 |
+
"sparsity/below 1e-6": 14863,
|
10 |
+
"positivity": 0.56973,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 409,
|
13 |
+
"1": 263,
|
14 |
+
"2": 416,
|
15 |
+
"3": 195,
|
16 |
+
"4": 357,
|
17 |
+
"5": 319,
|
18 |
+
"6": 162,
|
19 |
+
"7": 444,
|
20 |
+
"8": 244,
|
21 |
+
"9": 109,
|
22 |
+
"10": 219,
|
23 |
+
"11": 417,
|
24 |
+
"12": 278,
|
25 |
+
"13": 299,
|
26 |
+
"14": 350,
|
27 |
+
"15": 420,
|
28 |
+
"16": 209,
|
29 |
+
"17": 39,
|
30 |
+
"18": 362,
|
31 |
+
"19": 331,
|
32 |
+
"20": 204,
|
33 |
+
"21": 195,
|
34 |
+
"22": 415,
|
35 |
+
"23": 382,
|
36 |
+
"24": 350,
|
37 |
+
"25": 106,
|
38 |
+
"26": 342,
|
39 |
+
"27": 201,
|
40 |
+
"28": 334,
|
41 |
+
"29": 324,
|
42 |
+
"30": 390,
|
43 |
+
"31": 385,
|
44 |
+
"32": 412,
|
45 |
+
"33": 373,
|
46 |
+
"34": 449,
|
47 |
+
"35": 224,
|
48 |
+
"36": 296,
|
49 |
+
"37": 345,
|
50 |
+
"38": 86,
|
51 |
+
"39": 422,
|
52 |
+
"40": 327,
|
53 |
+
"41": 67,
|
54 |
+
"42": 447,
|
55 |
+
"43": 159,
|
56 |
+
"44": 278,
|
57 |
+
"45": 251,
|
58 |
+
"46": 303,
|
59 |
+
"47": 357,
|
60 |
+
"48": 147,
|
61 |
+
"49": 48,
|
62 |
+
"50": 301,
|
63 |
+
"51": 154,
|
64 |
+
"52": 177,
|
65 |
+
"53": 252,
|
66 |
+
"54": 435,
|
67 |
+
"55": 173,
|
68 |
+
"56": 445,
|
69 |
+
"57": 379,
|
70 |
+
"58": 175,
|
71 |
+
"59": 359,
|
72 |
+
"60": 399,
|
73 |
+
"61": 364,
|
74 |
+
"62": 251,
|
75 |
+
"63": 344
|
76 |
+
}
|
77 |
+
}
|
L20A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 20,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 32.97787094116211,
|
17 |
+
"out": 2.3633012771606445
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L20A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:63b1bc4cde7614df26b9cadb4d6cb29796862ee6167c444359d1f6cc4f6f98b2
|
3 |
+
size 1614040466
|
L20A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0003548504,
|
3 |
+
"explained_variance": 0.75766,
|
4 |
+
"l1": 16.56,
|
5 |
+
"ground_truth_norm": 2.379,
|
6 |
+
"reconstructed_norm": 2.083,
|
7 |
+
"error_norm": 1.1285,
|
8 |
+
"sparsity/below 1e-5": 17100,
|
9 |
+
"sparsity/below 1e-6": 16072,
|
10 |
+
"positivity": 0.5462,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 419,
|
13 |
+
"1": 199,
|
14 |
+
"2": 234,
|
15 |
+
"3": 327,
|
16 |
+
"4": 80,
|
17 |
+
"5": 479,
|
18 |
+
"6": 245,
|
19 |
+
"7": 362,
|
20 |
+
"8": 93,
|
21 |
+
"9": 449,
|
22 |
+
"10": 274,
|
23 |
+
"11": 78,
|
24 |
+
"12": 298,
|
25 |
+
"13": 315,
|
26 |
+
"14": 406,
|
27 |
+
"15": 257,
|
28 |
+
"16": 353,
|
29 |
+
"17": 162,
|
30 |
+
"18": 330,
|
31 |
+
"19": 359,
|
32 |
+
"20": 139,
|
33 |
+
"21": 102,
|
34 |
+
"22": 371,
|
35 |
+
"23": 330,
|
36 |
+
"24": 444,
|
37 |
+
"25": 309,
|
38 |
+
"26": 276,
|
39 |
+
"27": 160,
|
40 |
+
"28": 208,
|
41 |
+
"29": 345,
|
42 |
+
"30": 39,
|
43 |
+
"31": 93,
|
44 |
+
"32": 264,
|
45 |
+
"33": 58,
|
46 |
+
"34": 221,
|
47 |
+
"35": 229,
|
48 |
+
"36": 411,
|
49 |
+
"37": 212,
|
50 |
+
"38": 323,
|
51 |
+
"39": 335,
|
52 |
+
"40": 282,
|
53 |
+
"41": 327,
|
54 |
+
"42": 430,
|
55 |
+
"43": 308,
|
56 |
+
"44": 298,
|
57 |
+
"45": 420,
|
58 |
+
"46": 420,
|
59 |
+
"47": 279,
|
60 |
+
"48": 179,
|
61 |
+
"49": 193,
|
62 |
+
"50": 146,
|
63 |
+
"51": 374,
|
64 |
+
"52": 449,
|
65 |
+
"53": 423,
|
66 |
+
"54": 261,
|
67 |
+
"55": 311,
|
68 |
+
"56": 462,
|
69 |
+
"57": 118,
|
70 |
+
"58": 398,
|
71 |
+
"59": 107,
|
72 |
+
"60": 147,
|
73 |
+
"61": 341,
|
74 |
+
"62": 273,
|
75 |
+
"63": 364
|
76 |
+
}
|
77 |
+
}
|
L21A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 21,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 32.402626037597656,
|
17 |
+
"out": 3.294581651687622
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L21A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:551b4bd4081bce1b00c2cf43eeaefb76498251abd586663728edcbb31e78c91a
|
3 |
+
size 1614040466
|
L21A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0005338052,
|
3 |
+
"explained_variance": 0.80594,
|
4 |
+
"l1": 21.32,
|
5 |
+
"ground_truth_norm": 3.356,
|
6 |
+
"reconstructed_norm": 3.039,
|
7 |
+
"error_norm": 1.4049,
|
8 |
+
"sparsity/below 1e-5": 19748,
|
9 |
+
"sparsity/below 1e-6": 19430,
|
10 |
+
"positivity": 0.44174,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 85,
|
13 |
+
"1": 155,
|
14 |
+
"2": 197,
|
15 |
+
"3": 284,
|
16 |
+
"4": 319,
|
17 |
+
"5": 171,
|
18 |
+
"6": 218,
|
19 |
+
"7": 375,
|
20 |
+
"8": 86,
|
21 |
+
"9": 91,
|
22 |
+
"10": 352,
|
23 |
+
"11": 186,
|
24 |
+
"12": 356,
|
25 |
+
"13": 116,
|
26 |
+
"14": 289,
|
27 |
+
"15": 77,
|
28 |
+
"16": 220,
|
29 |
+
"17": 409,
|
30 |
+
"18": 84,
|
31 |
+
"19": 106,
|
32 |
+
"20": 89,
|
33 |
+
"21": 345,
|
34 |
+
"22": 311,
|
35 |
+
"23": 271,
|
36 |
+
"24": 181,
|
37 |
+
"25": 138,
|
38 |
+
"26": 282,
|
39 |
+
"27": 72,
|
40 |
+
"28": 124,
|
41 |
+
"29": 292,
|
42 |
+
"30": 199,
|
43 |
+
"31": 219,
|
44 |
+
"32": 230,
|
45 |
+
"33": 137,
|
46 |
+
"34": 362,
|
47 |
+
"35": 258,
|
48 |
+
"36": 226,
|
49 |
+
"37": 260,
|
50 |
+
"38": 287,
|
51 |
+
"39": 199,
|
52 |
+
"40": 315,
|
53 |
+
"41": 109,
|
54 |
+
"42": 311,
|
55 |
+
"43": 303,
|
56 |
+
"44": 281,
|
57 |
+
"45": 330,
|
58 |
+
"46": 390,
|
59 |
+
"47": 175,
|
60 |
+
"48": 307,
|
61 |
+
"49": 224,
|
62 |
+
"50": 292,
|
63 |
+
"51": 222,
|
64 |
+
"52": 446,
|
65 |
+
"53": 276,
|
66 |
+
"54": 262,
|
67 |
+
"55": 57,
|
68 |
+
"56": 209,
|
69 |
+
"57": 90,
|
70 |
+
"58": 151,
|
71 |
+
"59": 216,
|
72 |
+
"60": 184,
|
73 |
+
"61": 87,
|
74 |
+
"62": 362,
|
75 |
+
"63": 218
|
76 |
+
}
|
77 |
+
}
|
L22A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 22,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 34.090354919433594,
|
17 |
+
"out": 2.561861753463745
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L22A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:632b82ac74d6c1a42d3da0b866e87d4ce2efa80bfde820f5144ff7d86084a830
|
3 |
+
size 1614040466
|
L22A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0004232735,
|
3 |
+
"explained_variance": 0.76295,
|
4 |
+
"l1": 17.34,
|
5 |
+
"ground_truth_norm": 2.654,
|
6 |
+
"reconstructed_norm": 2.344,
|
7 |
+
"error_norm": 1.2268,
|
8 |
+
"sparsity/below 1e-5": 16999,
|
9 |
+
"sparsity/below 1e-6": 16475,
|
10 |
+
"positivity": 0.52084,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 184,
|
13 |
+
"1": 358,
|
14 |
+
"2": 169,
|
15 |
+
"3": 436,
|
16 |
+
"4": 375,
|
17 |
+
"5": 156,
|
18 |
+
"6": 52,
|
19 |
+
"7": 228,
|
20 |
+
"8": 286,
|
21 |
+
"9": 27,
|
22 |
+
"10": 274,
|
23 |
+
"11": 254,
|
24 |
+
"12": 38,
|
25 |
+
"13": 395,
|
26 |
+
"14": 154,
|
27 |
+
"15": 453,
|
28 |
+
"16": 430,
|
29 |
+
"17": 426,
|
30 |
+
"18": 321,
|
31 |
+
"19": 396,
|
32 |
+
"20": 119,
|
33 |
+
"21": 307,
|
34 |
+
"22": 284,
|
35 |
+
"23": 277,
|
36 |
+
"24": 66,
|
37 |
+
"25": 261,
|
38 |
+
"26": 156,
|
39 |
+
"27": 447,
|
40 |
+
"28": 368,
|
41 |
+
"29": 329,
|
42 |
+
"30": 400,
|
43 |
+
"31": 360,
|
44 |
+
"32": 370,
|
45 |
+
"33": 363,
|
46 |
+
"34": 91,
|
47 |
+
"35": 264,
|
48 |
+
"36": 145,
|
49 |
+
"37": 247,
|
50 |
+
"38": 38,
|
51 |
+
"39": 124,
|
52 |
+
"40": 416,
|
53 |
+
"41": 252,
|
54 |
+
"42": 422,
|
55 |
+
"43": 274,
|
56 |
+
"44": 261,
|
57 |
+
"45": 84,
|
58 |
+
"46": 260,
|
59 |
+
"47": 295,
|
60 |
+
"48": 268,
|
61 |
+
"49": 324,
|
62 |
+
"50": 161,
|
63 |
+
"51": 176,
|
64 |
+
"52": 86,
|
65 |
+
"53": 219,
|
66 |
+
"54": 369,
|
67 |
+
"55": 285,
|
68 |
+
"56": 250,
|
69 |
+
"57": 395,
|
70 |
+
"58": 382,
|
71 |
+
"59": 273,
|
72 |
+
"60": 206,
|
73 |
+
"61": 388,
|
74 |
+
"62": 418,
|
75 |
+
"63": 175
|
76 |
+
}
|
77 |
+
}
|
L23A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 23,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 33.478187561035156,
|
17 |
+
"out": 2.9890081882476807
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L23A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6955da6d7a1e9375d0b67241d9f99f1143f752d1aa25ed4de710927f0746dacc
|
3 |
+
size 1614040466
|
L23A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0004005697,
|
3 |
+
"explained_variance": 0.80796,
|
4 |
+
"l1": 18.76,
|
5 |
+
"ground_truth_norm": 2.916,
|
6 |
+
"reconstructed_norm": 2.651,
|
7 |
+
"error_norm": 1.1865,
|
8 |
+
"sparsity/below 1e-5": 15034,
|
9 |
+
"sparsity/below 1e-6": 14151,
|
10 |
+
"positivity": 0.64301,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 292,
|
13 |
+
"1": 314,
|
14 |
+
"2": 155,
|
15 |
+
"3": 317,
|
16 |
+
"4": 338,
|
17 |
+
"5": 308,
|
18 |
+
"6": 276,
|
19 |
+
"7": 123,
|
20 |
+
"8": 436,
|
21 |
+
"9": 274,
|
22 |
+
"10": 448,
|
23 |
+
"11": 430,
|
24 |
+
"12": 355,
|
25 |
+
"13": 341,
|
26 |
+
"14": 112,
|
27 |
+
"15": 192,
|
28 |
+
"16": 180,
|
29 |
+
"17": 395,
|
30 |
+
"18": 169,
|
31 |
+
"19": 414,
|
32 |
+
"20": 395,
|
33 |
+
"21": 404,
|
34 |
+
"22": 309,
|
35 |
+
"23": 490,
|
36 |
+
"24": 437,
|
37 |
+
"25": 419,
|
38 |
+
"26": 411,
|
39 |
+
"27": 131,
|
40 |
+
"28": 267,
|
41 |
+
"29": 364,
|
42 |
+
"30": 119,
|
43 |
+
"31": 471,
|
44 |
+
"32": 351,
|
45 |
+
"33": 375,
|
46 |
+
"34": 169,
|
47 |
+
"35": 448,
|
48 |
+
"36": 493,
|
49 |
+
"37": 270,
|
50 |
+
"38": 176,
|
51 |
+
"39": 339,
|
52 |
+
"40": 182,
|
53 |
+
"41": 336,
|
54 |
+
"42": 357,
|
55 |
+
"43": 367,
|
56 |
+
"44": 478,
|
57 |
+
"45": 335,
|
58 |
+
"46": 407,
|
59 |
+
"47": 498,
|
60 |
+
"48": 97,
|
61 |
+
"49": 433,
|
62 |
+
"50": 414,
|
63 |
+
"51": 423,
|
64 |
+
"52": 342,
|
65 |
+
"53": 111,
|
66 |
+
"54": 389,
|
67 |
+
"55": 344,
|
68 |
+
"56": 269,
|
69 |
+
"57": 379,
|
70 |
+
"58": 273,
|
71 |
+
"59": 358,
|
72 |
+
"60": 411,
|
73 |
+
"61": 443,
|
74 |
+
"62": 229,
|
75 |
+
"63": 488
|
76 |
+
}
|
77 |
+
}
|