Upload folder using huggingface_hub
Browse files- L2A/config.json +31 -0
- L2A/final.pth +3 -0
- L2A/metrics.json +77 -0
- L3A/config.json +31 -0
- L3A/final.pth +3 -0
- L3A/metrics.json +77 -0
- L4A/config.json +31 -0
- L4A/final.pth +3 -0
- L4A/metrics.json +77 -0
L2A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 2,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 25.82196044921875,
|
17 |
+
"out": 0.8914777040481567
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L2A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:292697deb870ed9e8ea3ff5db00f022f1181965d97a0db9a6b355465d5505413
|
3 |
+
size 1614040466
|
L2A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 1.25998e-05,
|
3 |
+
"explained_variance": 0.88186,
|
4 |
+
"l1": 4.18,
|
5 |
+
"ground_truth_norm": 0.889,
|
6 |
+
"reconstructed_norm": 0.862,
|
7 |
+
"error_norm": 0.2141,
|
8 |
+
"sparsity/below 1e-5": 16279,
|
9 |
+
"sparsity/below 1e-6": 15533,
|
10 |
+
"positivity": 0.54269,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 312,
|
13 |
+
"1": 338,
|
14 |
+
"2": 186,
|
15 |
+
"3": 141,
|
16 |
+
"4": 450,
|
17 |
+
"5": 439,
|
18 |
+
"6": 173,
|
19 |
+
"7": 377,
|
20 |
+
"8": 167,
|
21 |
+
"9": 128,
|
22 |
+
"10": 332,
|
23 |
+
"11": 349,
|
24 |
+
"12": 250,
|
25 |
+
"13": 171,
|
26 |
+
"14": 422,
|
27 |
+
"15": 303,
|
28 |
+
"16": 429,
|
29 |
+
"17": 188,
|
30 |
+
"18": 65,
|
31 |
+
"19": 360,
|
32 |
+
"20": 321,
|
33 |
+
"21": 215,
|
34 |
+
"22": 432,
|
35 |
+
"23": 426,
|
36 |
+
"24": 285,
|
37 |
+
"25": 167,
|
38 |
+
"26": 345,
|
39 |
+
"27": 167,
|
40 |
+
"28": 350,
|
41 |
+
"29": 461,
|
42 |
+
"30": 262,
|
43 |
+
"31": 271,
|
44 |
+
"32": 79,
|
45 |
+
"33": 134,
|
46 |
+
"34": 179,
|
47 |
+
"35": 356,
|
48 |
+
"36": 329,
|
49 |
+
"37": 433,
|
50 |
+
"38": 269,
|
51 |
+
"39": 84,
|
52 |
+
"40": 243,
|
53 |
+
"41": 443,
|
54 |
+
"42": 348,
|
55 |
+
"43": 196,
|
56 |
+
"44": 169,
|
57 |
+
"45": 333,
|
58 |
+
"46": 291,
|
59 |
+
"47": 171,
|
60 |
+
"48": 310,
|
61 |
+
"49": 291,
|
62 |
+
"50": 375,
|
63 |
+
"51": 327,
|
64 |
+
"52": 305,
|
65 |
+
"53": 92,
|
66 |
+
"54": 262,
|
67 |
+
"55": 207,
|
68 |
+
"56": 236,
|
69 |
+
"57": 328,
|
70 |
+
"58": 165,
|
71 |
+
"59": 395,
|
72 |
+
"60": 211,
|
73 |
+
"61": 451,
|
74 |
+
"62": 68,
|
75 |
+
"63": 421
|
76 |
+
}
|
77 |
+
}
|
L3A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 3,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 22.983654022216797,
|
17 |
+
"out": 1.1687864065170288
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L3A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:96f03145dca4a61a1f544ef7ce14602c6492bd6c6c2350c784b4b0e70451446e
|
3 |
+
size 1614040466
|
L3A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 4.04951e-05,
|
3 |
+
"explained_variance": 0.82606,
|
4 |
+
"l1": 5.99,
|
5 |
+
"ground_truth_norm": 1.16,
|
6 |
+
"reconstructed_norm": 1.089,
|
7 |
+
"error_norm": 0.3918,
|
8 |
+
"sparsity/below 1e-5": 15990,
|
9 |
+
"sparsity/below 1e-6": 15516,
|
10 |
+
"positivity": 0.54037,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 290,
|
13 |
+
"1": 176,
|
14 |
+
"2": 267,
|
15 |
+
"3": 238,
|
16 |
+
"4": 334,
|
17 |
+
"5": 57,
|
18 |
+
"6": 472,
|
19 |
+
"7": 370,
|
20 |
+
"8": 150,
|
21 |
+
"9": 158,
|
22 |
+
"10": 340,
|
23 |
+
"11": 212,
|
24 |
+
"12": 254,
|
25 |
+
"13": 425,
|
26 |
+
"14": 233,
|
27 |
+
"15": 418,
|
28 |
+
"16": 465,
|
29 |
+
"17": 396,
|
30 |
+
"18": 458,
|
31 |
+
"19": 64,
|
32 |
+
"20": 346,
|
33 |
+
"21": 162,
|
34 |
+
"22": 166,
|
35 |
+
"23": 367,
|
36 |
+
"24": 270,
|
37 |
+
"25": 273,
|
38 |
+
"26": 185,
|
39 |
+
"27": 236,
|
40 |
+
"28": 230,
|
41 |
+
"29": 343,
|
42 |
+
"30": 410,
|
43 |
+
"31": 238,
|
44 |
+
"32": 207,
|
45 |
+
"33": 26,
|
46 |
+
"34": 129,
|
47 |
+
"35": 148,
|
48 |
+
"36": 104,
|
49 |
+
"37": 305,
|
50 |
+
"38": 478,
|
51 |
+
"39": 364,
|
52 |
+
"40": 177,
|
53 |
+
"41": 434,
|
54 |
+
"42": 360,
|
55 |
+
"43": 491,
|
56 |
+
"44": 162,
|
57 |
+
"45": 342,
|
58 |
+
"46": 299,
|
59 |
+
"47": 363,
|
60 |
+
"48": 60,
|
61 |
+
"49": 254,
|
62 |
+
"50": 299,
|
63 |
+
"51": 243,
|
64 |
+
"52": 379,
|
65 |
+
"53": 91,
|
66 |
+
"54": 364,
|
67 |
+
"55": 449,
|
68 |
+
"56": 412,
|
69 |
+
"57": 112,
|
70 |
+
"58": 265,
|
71 |
+
"59": 188,
|
72 |
+
"60": 55,
|
73 |
+
"61": 342,
|
74 |
+
"62": 311,
|
75 |
+
"63": 491
|
76 |
+
}
|
77 |
+
}
|
L4A/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 256,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 64,
|
5 |
+
"n_ov_heads": 32768,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 1024,
|
11 |
+
"layer": 4,
|
12 |
+
"model_name": "meta-llama/Llama-3.1-8B",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 128,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 22.783418655395508,
|
17 |
+
"out": 1.4010306596755981
|
18 |
+
},
|
19 |
+
"d_model": 4096,
|
20 |
+
"attn_scale": 11.313708498984761,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 256,
|
24 |
+
"rotary_base": 500000.0,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": true,
|
27 |
+
"NTK_by_parts_low_freq_factor": 1.0,
|
28 |
+
"NTK_by_parts_high_freq_factor": 4.0,
|
29 |
+
"NTK_by_parts_factor": 8.0,
|
30 |
+
"old_context_len": 8192
|
31 |
+
}
|
L4A/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f02d1943cbe67836523652be7175806ef2af2a3765bb3940864e6c58162c778
|
3 |
+
size 1614040466
|
L4A/metrics.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 7.74775e-05,
|
3 |
+
"explained_variance": 0.80203,
|
4 |
+
"l1": 7.87,
|
5 |
+
"ground_truth_norm": 1.413,
|
6 |
+
"reconstructed_norm": 1.301,
|
7 |
+
"error_norm": 0.5429,
|
8 |
+
"sparsity/below 1e-5": 18830,
|
9 |
+
"sparsity/below 1e-6": 18220,
|
10 |
+
"positivity": 0.45062,
|
11 |
+
"ov_head_live_count": {
|
12 |
+
"0": 327,
|
13 |
+
"1": 150,
|
14 |
+
"2": 366,
|
15 |
+
"3": 310,
|
16 |
+
"4": 107,
|
17 |
+
"5": 407,
|
18 |
+
"6": 80,
|
19 |
+
"7": 52,
|
20 |
+
"8": 210,
|
21 |
+
"9": 213,
|
22 |
+
"10": 228,
|
23 |
+
"11": 212,
|
24 |
+
"12": 214,
|
25 |
+
"13": 360,
|
26 |
+
"14": 273,
|
27 |
+
"15": 202,
|
28 |
+
"16": 174,
|
29 |
+
"17": 273,
|
30 |
+
"18": 45,
|
31 |
+
"19": 165,
|
32 |
+
"20": 194,
|
33 |
+
"21": 173,
|
34 |
+
"22": 177,
|
35 |
+
"23": 238,
|
36 |
+
"24": 383,
|
37 |
+
"25": 198,
|
38 |
+
"26": 193,
|
39 |
+
"27": 415,
|
40 |
+
"28": 297,
|
41 |
+
"29": 312,
|
42 |
+
"30": 251,
|
43 |
+
"31": 430,
|
44 |
+
"32": 184,
|
45 |
+
"33": 38,
|
46 |
+
"34": 152,
|
47 |
+
"35": 150,
|
48 |
+
"36": 224,
|
49 |
+
"37": 303,
|
50 |
+
"38": 221,
|
51 |
+
"39": 195,
|
52 |
+
"40": 148,
|
53 |
+
"41": 128,
|
54 |
+
"42": 71,
|
55 |
+
"43": 433,
|
56 |
+
"44": 277,
|
57 |
+
"45": 204,
|
58 |
+
"46": 249,
|
59 |
+
"47": 100,
|
60 |
+
"48": 126,
|
61 |
+
"49": 305,
|
62 |
+
"50": 243,
|
63 |
+
"51": 270,
|
64 |
+
"52": 236,
|
65 |
+
"53": 387,
|
66 |
+
"54": 272,
|
67 |
+
"55": 180,
|
68 |
+
"56": 161,
|
69 |
+
"57": 233,
|
70 |
+
"58": 341,
|
71 |
+
"59": 331,
|
72 |
+
"60": 264,
|
73 |
+
"61": 117,
|
74 |
+
"62": 292,
|
75 |
+
"63": 302
|
76 |
+
}
|
77 |
+
}
|