Upload folder using huggingface_hub
Browse files- L24A/config.json +31 -0
- L24A/final.pth +3 -0
- L24A/metrics.json +77 -0
- L25A/config.json +31 -0
- L25A/final.pth +3 -0
- L25A/metrics.json +77 -0
- L26A/config.json +31 -0
- L26A/final.pth +3 -0
- L26A/metrics.json +77 -0
- L27A/config.json +31 -0
- L27A/final.pth +3 -0
- L27A/metrics.json +77 -0
- L28A/config.json +31 -0
- L28A/final.pth +3 -0
- L28A/metrics.json +77 -0
- L29A/config.json +31 -0
- L29A/final.pth +3 -0
- L29A/metrics.json +77 -0
- L30A/config.json +31 -0
- L30A/final.pth +3 -0
- L30A/metrics.json +77 -0
- L31A/config.json +31 -0
- L31A/final.pth +3 -0
- L31A/metrics.json +77 -0
L24A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 24,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 34.08556365966797,
|
17 |
+
"out": 2.8631839752197266
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L24A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6db3ce7561a7747e867823abd107d96cf142bebaa37a081f2454dbf4a8b7fcaf
|
3 |
+
size 1614040466
|
L24A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0004386434,
|
3 |
+
"explained_variance": 0.79569,
|
4 |
+
"l1": 19.61,
|
5 |
+
"ground_truth_norm": 2.987,
|
6 |
+
"reconstructed_norm": 2.709,
|
7 |
+
"error_norm": 1.2139,
|
8 |
+
"sparsity/below 1e-5": 16085,
|
9 |
+
"sparsity/below 1e-6": 15078,
|
10 |
+
"positivity": 0.57111,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 161,
|
13 |
+
"1": 392,
|
14 |
+
"2": 260,
|
15 |
+
"3": 443,
|
16 |
+
"4": 449,
|
17 |
+
"5": 457,
|
18 |
+
"6": 462,
|
19 |
+
"7": 136,
|
20 |
+
"8": 508,
|
21 |
+
"9": 210,
|
22 |
+
"10": 240,
|
23 |
+
"11": 451,
|
24 |
+
"12": 454,
|
25 |
+
"13": 33,
|
26 |
+
"14": 90,
|
27 |
+
"15": 355,
|
28 |
+
"16": 500,
|
29 |
+
"17": 459,
|
30 |
+
"18": 191,
|
31 |
+
"19": 176,
|
32 |
+
"20": 402,
|
33 |
+
"21": 161,
|
34 |
+
"22": 468,
|
35 |
+
"23": 480,
|
36 |
+
"24": 43,
|
37 |
+
"25": 325,
|
38 |
+
"26": 437,
|
39 |
+
"27": 230,
|
40 |
+
"28": 113,
|
41 |
+
"29": 332,
|
42 |
+
"30": 326,
|
43 |
+
"31": 304,
|
44 |
+
"32": 320,
|
45 |
+
"33": 356,
|
46 |
+
"34": 268,
|
47 |
+
"35": 264,
|
48 |
+
"36": 464,
|
49 |
+
"37": 388,
|
50 |
+
"38": 121,
|
51 |
+
"39": 113,
|
52 |
+
"40": 40,
|
53 |
+
"41": 479,
|
54 |
+
"42": 221,
|
55 |
+
"43": 350,
|
56 |
+
"44": 243,
|
57 |
+
"45": 249,
|
58 |
+
"46": 389,
|
59 |
+
"47": 487,
|
60 |
+
"48": 447,
|
61 |
+
"49": 372,
|
62 |
+
"50": 140,
|
63 |
+
"51": 136,
|
64 |
+
"52": 201,
|
65 |
+
"53": 23,
|
66 |
+
"54": 403,
|
67 |
+
"55": 309,
|
68 |
+
"56": 178,
|
69 |
+
"57": 57,
|
70 |
+
"58": 63,
|
71 |
+
"59": 450,
|
72 |
+
"60": 436,
|
73 |
+
"61": 298,
|
74 |
+
"62": 75,
|
75 |
+
"63": 326
|
76 |
+
}
|
77 |
+
}
|
L25A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 25,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 34.21133041381836,
|
17 |
+
"out": 3.386951446533203
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L25A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:933b326e066a790ab90883a761404a4cc32aa10a93a77d256e2dc9132e6d6412
|
3 |
+
size 1614040466
|
L25A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0005801773,
|
3 |
+
"explained_variance": 0.79763,
|
4 |
+
"l1": 23.5,
|
5 |
+
"ground_truth_norm": 3.534,
|
6 |
+
"reconstructed_norm": 3.213,
|
7 |
+
"error_norm": 1.431,
|
8 |
+
"sparsity/below 1e-5": 16451,
|
9 |
+
"sparsity/below 1e-6": 15619,
|
10 |
+
"positivity": 0.53839,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 366,
|
13 |
+
"1": 263,
|
14 |
+
"2": 337,
|
15 |
+
"3": 308,
|
16 |
+
"4": 359,
|
17 |
+
"5": 371,
|
18 |
+
"6": 224,
|
19 |
+
"7": 307,
|
20 |
+
"8": 37,
|
21 |
+
"9": 269,
|
22 |
+
"10": 365,
|
23 |
+
"11": 409,
|
24 |
+
"12": 398,
|
25 |
+
"13": 477,
|
26 |
+
"14": 369,
|
27 |
+
"15": 145,
|
28 |
+
"16": 58,
|
29 |
+
"17": 368,
|
30 |
+
"18": 377,
|
31 |
+
"19": 368,
|
32 |
+
"20": 250,
|
33 |
+
"21": 426,
|
34 |
+
"22": 316,
|
35 |
+
"23": 368,
|
36 |
+
"24": 271,
|
37 |
+
"25": 39,
|
38 |
+
"26": 258,
|
39 |
+
"27": 440,
|
40 |
+
"28": 274,
|
41 |
+
"29": 230,
|
42 |
+
"30": 311,
|
43 |
+
"31": 269,
|
44 |
+
"32": 6,
|
45 |
+
"33": 404,
|
46 |
+
"34": 453,
|
47 |
+
"35": 364,
|
48 |
+
"36": 378,
|
49 |
+
"37": 369,
|
50 |
+
"38": 153,
|
51 |
+
"39": 75,
|
52 |
+
"40": 94,
|
53 |
+
"41": 408,
|
54 |
+
"42": 438,
|
55 |
+
"43": 327,
|
56 |
+
"44": 318,
|
57 |
+
"45": 310,
|
58 |
+
"46": 361,
|
59 |
+
"47": 310,
|
60 |
+
"48": 338,
|
61 |
+
"49": 9,
|
62 |
+
"50": 10,
|
63 |
+
"51": 246,
|
64 |
+
"52": 98,
|
65 |
+
"53": 261,
|
66 |
+
"54": 148,
|
67 |
+
"55": 333,
|
68 |
+
"56": 475,
|
69 |
+
"57": 480,
|
70 |
+
"58": 94,
|
71 |
+
"59": 122,
|
72 |
+
"60": 239,
|
73 |
+
"61": 279,
|
74 |
+
"62": 83,
|
75 |
+
"63": 32
|
76 |
+
}
|
77 |
+
}
|
L26A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 26,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 33.632904052734375,
|
17 |
+
"out": 4.672374248504639
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L26A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aef09c960771267a2eb513c33913bb059e799af318d1b94810d16350c0f4f0f1
|
3 |
+
size 1614040466
|
L26A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0009872384,
|
3 |
+
"explained_variance": 0.75272,
|
4 |
+
"l1": 25.33,
|
5 |
+
"ground_truth_norm": 4.741,
|
6 |
+
"reconstructed_norm": 4.321,
|
7 |
+
"error_norm": 1.8952,
|
8 |
+
"sparsity/below 1e-5": 18803,
|
9 |
+
"sparsity/below 1e-6": 18155,
|
10 |
+
"positivity": 0.4664,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 410,
|
13 |
+
"1": 101,
|
14 |
+
"2": 471,
|
15 |
+
"3": 382,
|
16 |
+
"4": 104,
|
17 |
+
"5": 92,
|
18 |
+
"6": 113,
|
19 |
+
"7": 199,
|
20 |
+
"8": 377,
|
21 |
+
"9": 436,
|
22 |
+
"10": 144,
|
23 |
+
"11": 249,
|
24 |
+
"12": 367,
|
25 |
+
"13": 293,
|
26 |
+
"14": 71,
|
27 |
+
"15": 333,
|
28 |
+
"16": 290,
|
29 |
+
"17": 322,
|
30 |
+
"18": 273,
|
31 |
+
"19": 243,
|
32 |
+
"20": 438,
|
33 |
+
"21": 65,
|
34 |
+
"22": 228,
|
35 |
+
"23": 222,
|
36 |
+
"24": 309,
|
37 |
+
"25": 65,
|
38 |
+
"26": 74,
|
39 |
+
"27": 392,
|
40 |
+
"28": 381,
|
41 |
+
"29": 390,
|
42 |
+
"30": 286,
|
43 |
+
"31": 77,
|
44 |
+
"32": 324,
|
45 |
+
"33": 404,
|
46 |
+
"34": 32,
|
47 |
+
"35": 5,
|
48 |
+
"36": 167,
|
49 |
+
"37": 183,
|
50 |
+
"38": 37,
|
51 |
+
"39": 251,
|
52 |
+
"40": 112,
|
53 |
+
"41": 382,
|
54 |
+
"42": 188,
|
55 |
+
"43": 145,
|
56 |
+
"44": 279,
|
57 |
+
"45": 372,
|
58 |
+
"46": 123,
|
59 |
+
"47": 182,
|
60 |
+
"48": 66,
|
61 |
+
"49": 98,
|
62 |
+
"50": 242,
|
63 |
+
"51": 251,
|
64 |
+
"52": 265,
|
65 |
+
"53": 350,
|
66 |
+
"54": 346,
|
67 |
+
"55": 26,
|
68 |
+
"56": 351,
|
69 |
+
"57": 360,
|
70 |
+
"58": 80,
|
71 |
+
"59": 305,
|
72 |
+
"60": 409,
|
73 |
+
"61": 305,
|
74 |
+
"62": 366,
|
75 |
+
"63": 80
|
76 |
+
}
|
77 |
+
}
|
L27A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 27,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 34.473514556884766,
|
17 |
+
"out": 4.5733642578125
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L27A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af816145e6fec79bf175936f6112b908abd3a85634518d54cd7b0195211daed3
|
3 |
+
size 1614040466
|
L27A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0010833642,
|
3 |
+
"explained_variance": 0.76039,
|
4 |
+
"l1": 28.78,
|
5 |
+
"ground_truth_norm": 4.677,
|
6 |
+
"reconstructed_norm": 4.221,
|
7 |
+
"error_norm": 1.9506,
|
8 |
+
"sparsity/below 1e-5": 17314,
|
9 |
+
"sparsity/below 1e-6": 16541,
|
10 |
+
"positivity": 0.51071,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 227,
|
13 |
+
"1": 424,
|
14 |
+
"2": 132,
|
15 |
+
"3": 260,
|
16 |
+
"4": 447,
|
17 |
+
"5": 510,
|
18 |
+
"6": 381,
|
19 |
+
"7": 470,
|
20 |
+
"8": 40,
|
21 |
+
"9": 354,
|
22 |
+
"10": 225,
|
23 |
+
"11": 161,
|
24 |
+
"12": 448,
|
25 |
+
"13": 34,
|
26 |
+
"14": 321,
|
27 |
+
"15": 437,
|
28 |
+
"16": 308,
|
29 |
+
"17": 438,
|
30 |
+
"18": 199,
|
31 |
+
"19": 29,
|
32 |
+
"20": 60,
|
33 |
+
"21": 189,
|
34 |
+
"22": 264,
|
35 |
+
"23": 453,
|
36 |
+
"24": 295,
|
37 |
+
"25": 468,
|
38 |
+
"26": 194,
|
39 |
+
"27": 176,
|
40 |
+
"28": 36,
|
41 |
+
"29": 330,
|
42 |
+
"30": 257,
|
43 |
+
"31": 65,
|
44 |
+
"32": 228,
|
45 |
+
"33": 179,
|
46 |
+
"34": 299,
|
47 |
+
"35": 284,
|
48 |
+
"36": 221,
|
49 |
+
"37": 322,
|
50 |
+
"38": 57,
|
51 |
+
"39": 468,
|
52 |
+
"40": 103,
|
53 |
+
"41": 394,
|
54 |
+
"42": 377,
|
55 |
+
"43": 329,
|
56 |
+
"44": 65,
|
57 |
+
"45": 346,
|
58 |
+
"46": 292,
|
59 |
+
"47": 384,
|
60 |
+
"48": 62,
|
61 |
+
"49": 275,
|
62 |
+
"50": 331,
|
63 |
+
"51": 3,
|
64 |
+
"52": 244,
|
65 |
+
"53": 387,
|
66 |
+
"54": 38,
|
67 |
+
"55": 425,
|
68 |
+
"56": 449,
|
69 |
+
"57": 9,
|
70 |
+
"58": 452,
|
71 |
+
"59": 228,
|
72 |
+
"60": 36,
|
73 |
+
"61": 189,
|
74 |
+
"62": 295,
|
75 |
+
"63": 332
|
76 |
+
}
|
77 |
+
}
|
L28A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 28,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 34.196075439453125,
|
17 |
+
"out": 5.965426921844482
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L28A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0793015d991e07b523b99ba4bccf68aaea52a77eb65b00fbe2a7df4de3390a03
|
3 |
+
size 1614040466
|
L28A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0016028101,
|
3 |
+
"explained_variance": 0.71255,
|
4 |
+
"l1": 31.98,
|
5 |
+
"ground_truth_norm": 5.944,
|
6 |
+
"reconstructed_norm": 5.393,
|
7 |
+
"error_norm": 2.4326,
|
8 |
+
"sparsity/below 1e-5": 19217,
|
9 |
+
"sparsity/below 1e-6": 18760,
|
10 |
+
"positivity": 0.43726,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 408,
|
13 |
+
"1": 166,
|
14 |
+
"2": 355,
|
15 |
+
"3": 215,
|
16 |
+
"4": 481,
|
17 |
+
"5": 54,
|
18 |
+
"6": 49,
|
19 |
+
"7": 319,
|
20 |
+
"8": 415,
|
21 |
+
"9": 85,
|
22 |
+
"10": 13,
|
23 |
+
"11": 127,
|
24 |
+
"12": 454,
|
25 |
+
"13": 319,
|
26 |
+
"14": 307,
|
27 |
+
"15": 289,
|
28 |
+
"16": 161,
|
29 |
+
"17": 101,
|
30 |
+
"18": 60,
|
31 |
+
"19": 238,
|
32 |
+
"20": 423,
|
33 |
+
"21": 257,
|
34 |
+
"22": 165,
|
35 |
+
"23": 388,
|
36 |
+
"24": 89,
|
37 |
+
"25": 10,
|
38 |
+
"26": 164,
|
39 |
+
"27": 39,
|
40 |
+
"28": 0,
|
41 |
+
"29": 371,
|
42 |
+
"30": 122,
|
43 |
+
"31": 327,
|
44 |
+
"32": 210,
|
45 |
+
"33": 419,
|
46 |
+
"34": 164,
|
47 |
+
"35": 433,
|
48 |
+
"36": 130,
|
49 |
+
"37": 118,
|
50 |
+
"38": 94,
|
51 |
+
"39": 92,
|
52 |
+
"40": 6,
|
53 |
+
"41": 417,
|
54 |
+
"42": 64,
|
55 |
+
"43": 369,
|
56 |
+
"44": 343,
|
57 |
+
"45": 235,
|
58 |
+
"46": 422,
|
59 |
+
"47": 5,
|
60 |
+
"48": 294,
|
61 |
+
"49": 234,
|
62 |
+
"50": 357,
|
63 |
+
"51": 206,
|
64 |
+
"52": 0,
|
65 |
+
"53": 343,
|
66 |
+
"54": 396,
|
67 |
+
"55": 230,
|
68 |
+
"56": 366,
|
69 |
+
"57": 47,
|
70 |
+
"58": 356,
|
71 |
+
"59": 19,
|
72 |
+
"60": 133,
|
73 |
+
"61": 13,
|
74 |
+
"62": 450,
|
75 |
+
"63": 392
|
76 |
+
}
|
77 |
+
}
|
L29A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 29,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 34.094024658203125,
|
17 |
+
"out": 7.520937442779541
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L29A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52d1a387433aafd70e2a1ab578ee7d01298f6f12f5b2bd6ed70972ae4297fcaf
|
3 |
+
size 1614040466
|
L29A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.001713933,
|
3 |
+
"explained_variance": 0.78354,
|
4 |
+
"l1": 39.04,
|
5 |
+
"ground_truth_norm": 7.541,
|
6 |
+
"reconstructed_norm": 7.079,
|
7 |
+
"error_norm": 2.4444,
|
8 |
+
"sparsity/below 1e-5": 23002,
|
9 |
+
"sparsity/below 1e-6": 22114,
|
10 |
+
"positivity": 0.34143,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 432,
|
13 |
+
"1": 236,
|
14 |
+
"2": 18,
|
15 |
+
"3": 220,
|
16 |
+
"4": 390,
|
17 |
+
"5": 302,
|
18 |
+
"6": 408,
|
19 |
+
"7": 91,
|
20 |
+
"8": 109,
|
21 |
+
"9": 58,
|
22 |
+
"10": 49,
|
23 |
+
"11": 303,
|
24 |
+
"12": 239,
|
25 |
+
"13": 217,
|
26 |
+
"14": 196,
|
27 |
+
"15": 49,
|
28 |
+
"16": 186,
|
29 |
+
"17": 30,
|
30 |
+
"18": 155,
|
31 |
+
"19": 184,
|
32 |
+
"20": 143,
|
33 |
+
"21": 190,
|
34 |
+
"22": 235,
|
35 |
+
"23": 46,
|
36 |
+
"24": 227,
|
37 |
+
"25": 211,
|
38 |
+
"26": 299,
|
39 |
+
"27": 146,
|
40 |
+
"28": 306,
|
41 |
+
"29": 113,
|
42 |
+
"30": 117,
|
43 |
+
"31": 307,
|
44 |
+
"32": 60,
|
45 |
+
"33": 283,
|
46 |
+
"34": 71,
|
47 |
+
"35": 18,
|
48 |
+
"36": 66,
|
49 |
+
"37": 86,
|
50 |
+
"38": 243,
|
51 |
+
"39": 90,
|
52 |
+
"40": 72,
|
53 |
+
"41": 359,
|
54 |
+
"42": 68,
|
55 |
+
"43": 426,
|
56 |
+
"44": 210,
|
57 |
+
"45": 424,
|
58 |
+
"46": 91,
|
59 |
+
"47": 29,
|
60 |
+
"48": 99,
|
61 |
+
"49": 0,
|
62 |
+
"50": 7,
|
63 |
+
"51": 348,
|
64 |
+
"52": 216,
|
65 |
+
"53": 186,
|
66 |
+
"54": 120,
|
67 |
+
"55": 191,
|
68 |
+
"56": 25,
|
69 |
+
"57": 340,
|
70 |
+
"58": 365,
|
71 |
+
"59": 181,
|
72 |
+
"60": 64,
|
73 |
+
"61": 28,
|
74 |
+
"62": 201,
|
75 |
+
"63": 9
|
76 |
+
}
|
77 |
+
}
|
L30A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 30,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 34.17445755004883,
|
17 |
+
"out": 11.095989227294922
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L30A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b36319a4cd58d2ee25c7cf90a2aa5ef3f57440bda6cb47e965974e10b460c119
|
3 |
+
size 1614040466
|
L30A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0040983707,
|
3 |
+
"explained_variance": 0.73162,
|
4 |
+
"l1": 48.05,
|
5 |
+
"ground_truth_norm": 11.015,
|
6 |
+
"reconstructed_norm": 10.267,
|
7 |
+
"error_norm": 3.8362,
|
8 |
+
"sparsity/below 1e-5": 22716,
|
9 |
+
"sparsity/below 1e-6": 22018,
|
10 |
+
"positivity": 0.34238,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 255,
|
13 |
+
"1": 216,
|
14 |
+
"2": 0,
|
15 |
+
"3": 231,
|
16 |
+
"4": 172,
|
17 |
+
"5": 342,
|
18 |
+
"6": 250,
|
19 |
+
"7": 249,
|
20 |
+
"8": 107,
|
21 |
+
"9": 65,
|
22 |
+
"10": 4,
|
23 |
+
"11": 465,
|
24 |
+
"12": 5,
|
25 |
+
"13": 12,
|
26 |
+
"14": 161,
|
27 |
+
"15": 305,
|
28 |
+
"16": 38,
|
29 |
+
"17": 86,
|
30 |
+
"18": 163,
|
31 |
+
"19": 153,
|
32 |
+
"20": 127,
|
33 |
+
"21": 94,
|
34 |
+
"22": 157,
|
35 |
+
"23": 106,
|
36 |
+
"24": 303,
|
37 |
+
"25": 340,
|
38 |
+
"26": 143,
|
39 |
+
"27": 346,
|
40 |
+
"28": 258,
|
41 |
+
"29": 335,
|
42 |
+
"30": 46,
|
43 |
+
"31": 123,
|
44 |
+
"32": 17,
|
45 |
+
"33": 99,
|
46 |
+
"34": 69,
|
47 |
+
"35": 30,
|
48 |
+
"36": 261,
|
49 |
+
"37": 61,
|
50 |
+
"38": 83,
|
51 |
+
"39": 34,
|
52 |
+
"40": 64,
|
53 |
+
"41": 84,
|
54 |
+
"42": 35,
|
55 |
+
"43": 236,
|
56 |
+
"44": 143,
|
57 |
+
"45": 397,
|
58 |
+
"46": 131,
|
59 |
+
"47": 218,
|
60 |
+
"48": 452,
|
61 |
+
"49": 478,
|
62 |
+
"50": 113,
|
63 |
+
"51": 141,
|
64 |
+
"52": 342,
|
65 |
+
"53": 364,
|
66 |
+
"54": 82,
|
67 |
+
"55": 4,
|
68 |
+
"56": 142,
|
69 |
+
"57": 33,
|
70 |
+
"58": 265,
|
71 |
+
"59": 318,
|
72 |
+
"60": 433,
|
73 |
+
"61": 29,
|
74 |
+
"62": 21,
|
75 |
+
"63": 383
|
76 |
+
}
|
77 |
+
}
|
L31A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 31,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 28.58884620666504,
|
17 |
+
"out": 18.207286834716797
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L31A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:62402678ec4e8895918806cf95462f9edb8cbf9f31a8bc755f54bcd8ff8e6609
|
3 |
+
size 1614040466
|
L31A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0050583091,
|
3 |
+
"explained_variance": 0.83675,
|
4 |
+
"l1": 97.56,
|
5 |
+
"ground_truth_norm": 18.24,
|
6 |
+
"reconstructed_norm": 17.639,
|
7 |
+
"error_norm": 4.4254,
|
8 |
+
"sparsity/below 1e-5": 24917,
|
9 |
+
"sparsity/below 1e-6": 24728,
|
10 |
+
"positivity": 0.24741,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 189,
|
13 |
+
"1": 3,
|
14 |
+
"2": 159,
|
15 |
+
"3": 82,
|
16 |
+
"4": 71,
|
17 |
+
"5": 139,
|
18 |
+
"6": 112,
|
19 |
+
"7": 353,
|
20 |
+
"8": 257,
|
21 |
+
"9": 168,
|
22 |
+
"10": 48,
|
23 |
+
"11": 46,
|
24 |
+
"12": 184,
|
25 |
+
"13": 92,
|
26 |
+
"14": 243,
|
27 |
+
"15": 2,
|
28 |
+
"16": 180,
|
29 |
+
"17": 233,
|
30 |
+
"18": 65,
|
31 |
+
"19": 127,
|
32 |
+
"20": 124,
|
33 |
+
"21": 105,
|
34 |
+
"22": 226,
|
35 |
+
"23": 6,
|
36 |
+
"24": 107,
|
37 |
+
"25": 131,
|
38 |
+
"26": 201,
|
39 |
+
"27": 114,
|
40 |
+
"28": 144,
|
41 |
+
"29": 108,
|
42 |
+
"30": 126,
|
43 |
+
"31": 74,
|
44 |
+
"32": 41,
|
45 |
+
"33": 219,
|
46 |
+
"34": 188,
|
47 |
+
"35": 23,
|
48 |
+
"36": 156,
|
49 |
+
"37": 215,
|
50 |
+
"38": 69,
|
51 |
+
"39": 134,
|
52 |
+
"40": 50,
|
53 |
+
"41": 19,
|
54 |
+
"42": 59,
|
55 |
+
"43": 133,
|
56 |
+
"44": 63,
|
57 |
+
"45": 111,
|
58 |
+
"46": 264,
|
59 |
+
"47": 65,
|
60 |
+
"48": 99,
|
61 |
+
"49": 181,
|
62 |
+
"50": 128,
|
63 |
+
"51": 239,
|
64 |
+
"52": 80,
|
65 |
+
"53": 139,
|
66 |
+
"54": 76,
|
67 |
+
"55": 190,
|
68 |
+
"56": 64,
|
69 |
+
"57": 91,
|
70 |
+
"58": 179,
|
71 |
+
"59": 34,
|
72 |
+
"60": 149,
|
73 |
+
"61": 143,
|
74 |
+
"62": 111,
|
75 |
+
"63": 176
|
76 |
+
}
|
77 |
+
}
|