fnlp
/

Hzfinfdu commited on
Commit
7c219b2
·
verified ·
1 Parent(s): 60a1fdf

Upload folder using huggingface_hub

Browse files
L24A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 24,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 34.08556365966797,
17
+ "out": 2.8631839752197266
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L24A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6db3ce7561a7747e867823abd107d96cf142bebaa37a081f2454dbf4a8b7fcaf
3
+ size 1614040466
L24A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0004386434,
3
+ "explained_variance": 0.79569,
4
+ "l1": 19.61,
5
+ "ground_truth_norm": 2.987,
6
+ "reconstructed_norm": 2.709,
7
+ "error_norm": 1.2139,
8
+ "sparsity/below 1e-5": 16085,
9
+ "sparsity/below 1e-6": 15078,
10
+ "positivity": 0.57111,
11
+ "ov_head_live_count": {
12
+ "0": 161,
13
+ "1": 392,
14
+ "2": 260,
15
+ "3": 443,
16
+ "4": 449,
17
+ "5": 457,
18
+ "6": 462,
19
+ "7": 136,
20
+ "8": 508,
21
+ "9": 210,
22
+ "10": 240,
23
+ "11": 451,
24
+ "12": 454,
25
+ "13": 33,
26
+ "14": 90,
27
+ "15": 355,
28
+ "16": 500,
29
+ "17": 459,
30
+ "18": 191,
31
+ "19": 176,
32
+ "20": 402,
33
+ "21": 161,
34
+ "22": 468,
35
+ "23": 480,
36
+ "24": 43,
37
+ "25": 325,
38
+ "26": 437,
39
+ "27": 230,
40
+ "28": 113,
41
+ "29": 332,
42
+ "30": 326,
43
+ "31": 304,
44
+ "32": 320,
45
+ "33": 356,
46
+ "34": 268,
47
+ "35": 264,
48
+ "36": 464,
49
+ "37": 388,
50
+ "38": 121,
51
+ "39": 113,
52
+ "40": 40,
53
+ "41": 479,
54
+ "42": 221,
55
+ "43": 350,
56
+ "44": 243,
57
+ "45": 249,
58
+ "46": 389,
59
+ "47": 487,
60
+ "48": 447,
61
+ "49": 372,
62
+ "50": 140,
63
+ "51": 136,
64
+ "52": 201,
65
+ "53": 23,
66
+ "54": 403,
67
+ "55": 309,
68
+ "56": 178,
69
+ "57": 57,
70
+ "58": 63,
71
+ "59": 450,
72
+ "60": 436,
73
+ "61": 298,
74
+ "62": 75,
75
+ "63": 326
76
+ }
77
+ }
L25A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 25,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 34.21133041381836,
17
+ "out": 3.386951446533203
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L25A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:933b326e066a790ab90883a761404a4cc32aa10a93a77d256e2dc9132e6d6412
3
+ size 1614040466
L25A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0005801773,
3
+ "explained_variance": 0.79763,
4
+ "l1": 23.5,
5
+ "ground_truth_norm": 3.534,
6
+ "reconstructed_norm": 3.213,
7
+ "error_norm": 1.431,
8
+ "sparsity/below 1e-5": 16451,
9
+ "sparsity/below 1e-6": 15619,
10
+ "positivity": 0.53839,
11
+ "ov_head_live_count": {
12
+ "0": 366,
13
+ "1": 263,
14
+ "2": 337,
15
+ "3": 308,
16
+ "4": 359,
17
+ "5": 371,
18
+ "6": 224,
19
+ "7": 307,
20
+ "8": 37,
21
+ "9": 269,
22
+ "10": 365,
23
+ "11": 409,
24
+ "12": 398,
25
+ "13": 477,
26
+ "14": 369,
27
+ "15": 145,
28
+ "16": 58,
29
+ "17": 368,
30
+ "18": 377,
31
+ "19": 368,
32
+ "20": 250,
33
+ "21": 426,
34
+ "22": 316,
35
+ "23": 368,
36
+ "24": 271,
37
+ "25": 39,
38
+ "26": 258,
39
+ "27": 440,
40
+ "28": 274,
41
+ "29": 230,
42
+ "30": 311,
43
+ "31": 269,
44
+ "32": 6,
45
+ "33": 404,
46
+ "34": 453,
47
+ "35": 364,
48
+ "36": 378,
49
+ "37": 369,
50
+ "38": 153,
51
+ "39": 75,
52
+ "40": 94,
53
+ "41": 408,
54
+ "42": 438,
55
+ "43": 327,
56
+ "44": 318,
57
+ "45": 310,
58
+ "46": 361,
59
+ "47": 310,
60
+ "48": 338,
61
+ "49": 9,
62
+ "50": 10,
63
+ "51": 246,
64
+ "52": 98,
65
+ "53": 261,
66
+ "54": 148,
67
+ "55": 333,
68
+ "56": 475,
69
+ "57": 480,
70
+ "58": 94,
71
+ "59": 122,
72
+ "60": 239,
73
+ "61": 279,
74
+ "62": 83,
75
+ "63": 32
76
+ }
77
+ }
L26A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 26,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 33.632904052734375,
17
+ "out": 4.672374248504639
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L26A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aef09c960771267a2eb513c33913bb059e799af318d1b94810d16350c0f4f0f1
3
+ size 1614040466
L26A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0009872384,
3
+ "explained_variance": 0.75272,
4
+ "l1": 25.33,
5
+ "ground_truth_norm": 4.741,
6
+ "reconstructed_norm": 4.321,
7
+ "error_norm": 1.8952,
8
+ "sparsity/below 1e-5": 18803,
9
+ "sparsity/below 1e-6": 18155,
10
+ "positivity": 0.4664,
11
+ "ov_head_live_count": {
12
+ "0": 410,
13
+ "1": 101,
14
+ "2": 471,
15
+ "3": 382,
16
+ "4": 104,
17
+ "5": 92,
18
+ "6": 113,
19
+ "7": 199,
20
+ "8": 377,
21
+ "9": 436,
22
+ "10": 144,
23
+ "11": 249,
24
+ "12": 367,
25
+ "13": 293,
26
+ "14": 71,
27
+ "15": 333,
28
+ "16": 290,
29
+ "17": 322,
30
+ "18": 273,
31
+ "19": 243,
32
+ "20": 438,
33
+ "21": 65,
34
+ "22": 228,
35
+ "23": 222,
36
+ "24": 309,
37
+ "25": 65,
38
+ "26": 74,
39
+ "27": 392,
40
+ "28": 381,
41
+ "29": 390,
42
+ "30": 286,
43
+ "31": 77,
44
+ "32": 324,
45
+ "33": 404,
46
+ "34": 32,
47
+ "35": 5,
48
+ "36": 167,
49
+ "37": 183,
50
+ "38": 37,
51
+ "39": 251,
52
+ "40": 112,
53
+ "41": 382,
54
+ "42": 188,
55
+ "43": 145,
56
+ "44": 279,
57
+ "45": 372,
58
+ "46": 123,
59
+ "47": 182,
60
+ "48": 66,
61
+ "49": 98,
62
+ "50": 242,
63
+ "51": 251,
64
+ "52": 265,
65
+ "53": 350,
66
+ "54": 346,
67
+ "55": 26,
68
+ "56": 351,
69
+ "57": 360,
70
+ "58": 80,
71
+ "59": 305,
72
+ "60": 409,
73
+ "61": 305,
74
+ "62": 366,
75
+ "63": 80
76
+ }
77
+ }
L27A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 27,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 34.473514556884766,
17
+ "out": 4.5733642578125
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L27A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af816145e6fec79bf175936f6112b908abd3a85634518d54cd7b0195211daed3
3
+ size 1614040466
L27A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0010833642,
3
+ "explained_variance": 0.76039,
4
+ "l1": 28.78,
5
+ "ground_truth_norm": 4.677,
6
+ "reconstructed_norm": 4.221,
7
+ "error_norm": 1.9506,
8
+ "sparsity/below 1e-5": 17314,
9
+ "sparsity/below 1e-6": 16541,
10
+ "positivity": 0.51071,
11
+ "ov_head_live_count": {
12
+ "0": 227,
13
+ "1": 424,
14
+ "2": 132,
15
+ "3": 260,
16
+ "4": 447,
17
+ "5": 510,
18
+ "6": 381,
19
+ "7": 470,
20
+ "8": 40,
21
+ "9": 354,
22
+ "10": 225,
23
+ "11": 161,
24
+ "12": 448,
25
+ "13": 34,
26
+ "14": 321,
27
+ "15": 437,
28
+ "16": 308,
29
+ "17": 438,
30
+ "18": 199,
31
+ "19": 29,
32
+ "20": 60,
33
+ "21": 189,
34
+ "22": 264,
35
+ "23": 453,
36
+ "24": 295,
37
+ "25": 468,
38
+ "26": 194,
39
+ "27": 176,
40
+ "28": 36,
41
+ "29": 330,
42
+ "30": 257,
43
+ "31": 65,
44
+ "32": 228,
45
+ "33": 179,
46
+ "34": 299,
47
+ "35": 284,
48
+ "36": 221,
49
+ "37": 322,
50
+ "38": 57,
51
+ "39": 468,
52
+ "40": 103,
53
+ "41": 394,
54
+ "42": 377,
55
+ "43": 329,
56
+ "44": 65,
57
+ "45": 346,
58
+ "46": 292,
59
+ "47": 384,
60
+ "48": 62,
61
+ "49": 275,
62
+ "50": 331,
63
+ "51": 3,
64
+ "52": 244,
65
+ "53": 387,
66
+ "54": 38,
67
+ "55": 425,
68
+ "56": 449,
69
+ "57": 9,
70
+ "58": 452,
71
+ "59": 228,
72
+ "60": 36,
73
+ "61": 189,
74
+ "62": 295,
75
+ "63": 332
76
+ }
77
+ }
L28A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 28,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 34.196075439453125,
17
+ "out": 5.965426921844482
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L28A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0793015d991e07b523b99ba4bccf68aaea52a77eb65b00fbe2a7df4de3390a03
3
+ size 1614040466
L28A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0016028101,
3
+ "explained_variance": 0.71255,
4
+ "l1": 31.98,
5
+ "ground_truth_norm": 5.944,
6
+ "reconstructed_norm": 5.393,
7
+ "error_norm": 2.4326,
8
+ "sparsity/below 1e-5": 19217,
9
+ "sparsity/below 1e-6": 18760,
10
+ "positivity": 0.43726,
11
+ "ov_head_live_count": {
12
+ "0": 408,
13
+ "1": 166,
14
+ "2": 355,
15
+ "3": 215,
16
+ "4": 481,
17
+ "5": 54,
18
+ "6": 49,
19
+ "7": 319,
20
+ "8": 415,
21
+ "9": 85,
22
+ "10": 13,
23
+ "11": 127,
24
+ "12": 454,
25
+ "13": 319,
26
+ "14": 307,
27
+ "15": 289,
28
+ "16": 161,
29
+ "17": 101,
30
+ "18": 60,
31
+ "19": 238,
32
+ "20": 423,
33
+ "21": 257,
34
+ "22": 165,
35
+ "23": 388,
36
+ "24": 89,
37
+ "25": 10,
38
+ "26": 164,
39
+ "27": 39,
40
+ "28": 0,
41
+ "29": 371,
42
+ "30": 122,
43
+ "31": 327,
44
+ "32": 210,
45
+ "33": 419,
46
+ "34": 164,
47
+ "35": 433,
48
+ "36": 130,
49
+ "37": 118,
50
+ "38": 94,
51
+ "39": 92,
52
+ "40": 6,
53
+ "41": 417,
54
+ "42": 64,
55
+ "43": 369,
56
+ "44": 343,
57
+ "45": 235,
58
+ "46": 422,
59
+ "47": 5,
60
+ "48": 294,
61
+ "49": 234,
62
+ "50": 357,
63
+ "51": 206,
64
+ "52": 0,
65
+ "53": 343,
66
+ "54": 396,
67
+ "55": 230,
68
+ "56": 366,
69
+ "57": 47,
70
+ "58": 356,
71
+ "59": 19,
72
+ "60": 133,
73
+ "61": 13,
74
+ "62": 450,
75
+ "63": 392
76
+ }
77
+ }
L29A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 29,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 34.094024658203125,
17
+ "out": 7.520937442779541
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L29A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52d1a387433aafd70e2a1ab578ee7d01298f6f12f5b2bd6ed70972ae4297fcaf
3
+ size 1614040466
L29A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.001713933,
3
+ "explained_variance": 0.78354,
4
+ "l1": 39.04,
5
+ "ground_truth_norm": 7.541,
6
+ "reconstructed_norm": 7.079,
7
+ "error_norm": 2.4444,
8
+ "sparsity/below 1e-5": 23002,
9
+ "sparsity/below 1e-6": 22114,
10
+ "positivity": 0.34143,
11
+ "ov_head_live_count": {
12
+ "0": 432,
13
+ "1": 236,
14
+ "2": 18,
15
+ "3": 220,
16
+ "4": 390,
17
+ "5": 302,
18
+ "6": 408,
19
+ "7": 91,
20
+ "8": 109,
21
+ "9": 58,
22
+ "10": 49,
23
+ "11": 303,
24
+ "12": 239,
25
+ "13": 217,
26
+ "14": 196,
27
+ "15": 49,
28
+ "16": 186,
29
+ "17": 30,
30
+ "18": 155,
31
+ "19": 184,
32
+ "20": 143,
33
+ "21": 190,
34
+ "22": 235,
35
+ "23": 46,
36
+ "24": 227,
37
+ "25": 211,
38
+ "26": 299,
39
+ "27": 146,
40
+ "28": 306,
41
+ "29": 113,
42
+ "30": 117,
43
+ "31": 307,
44
+ "32": 60,
45
+ "33": 283,
46
+ "34": 71,
47
+ "35": 18,
48
+ "36": 66,
49
+ "37": 86,
50
+ "38": 243,
51
+ "39": 90,
52
+ "40": 72,
53
+ "41": 359,
54
+ "42": 68,
55
+ "43": 426,
56
+ "44": 210,
57
+ "45": 424,
58
+ "46": 91,
59
+ "47": 29,
60
+ "48": 99,
61
+ "49": 0,
62
+ "50": 7,
63
+ "51": 348,
64
+ "52": 216,
65
+ "53": 186,
66
+ "54": 120,
67
+ "55": 191,
68
+ "56": 25,
69
+ "57": 340,
70
+ "58": 365,
71
+ "59": 181,
72
+ "60": 64,
73
+ "61": 28,
74
+ "62": 201,
75
+ "63": 9
76
+ }
77
+ }
L30A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 30,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 34.17445755004883,
17
+ "out": 11.095989227294922
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L30A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b36319a4cd58d2ee25c7cf90a2aa5ef3f57440bda6cb47e965974e10b460c119
3
+ size 1614040466
L30A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0040983707,
3
+ "explained_variance": 0.73162,
4
+ "l1": 48.05,
5
+ "ground_truth_norm": 11.015,
6
+ "reconstructed_norm": 10.267,
7
+ "error_norm": 3.8362,
8
+ "sparsity/below 1e-5": 22716,
9
+ "sparsity/below 1e-6": 22018,
10
+ "positivity": 0.34238,
11
+ "ov_head_live_count": {
12
+ "0": 255,
13
+ "1": 216,
14
+ "2": 0,
15
+ "3": 231,
16
+ "4": 172,
17
+ "5": 342,
18
+ "6": 250,
19
+ "7": 249,
20
+ "8": 107,
21
+ "9": 65,
22
+ "10": 4,
23
+ "11": 465,
24
+ "12": 5,
25
+ "13": 12,
26
+ "14": 161,
27
+ "15": 305,
28
+ "16": 38,
29
+ "17": 86,
30
+ "18": 163,
31
+ "19": 153,
32
+ "20": 127,
33
+ "21": 94,
34
+ "22": 157,
35
+ "23": 106,
36
+ "24": 303,
37
+ "25": 340,
38
+ "26": 143,
39
+ "27": 346,
40
+ "28": 258,
41
+ "29": 335,
42
+ "30": 46,
43
+ "31": 123,
44
+ "32": 17,
45
+ "33": 99,
46
+ "34": 69,
47
+ "35": 30,
48
+ "36": 261,
49
+ "37": 61,
50
+ "38": 83,
51
+ "39": 34,
52
+ "40": 64,
53
+ "41": 84,
54
+ "42": 35,
55
+ "43": 236,
56
+ "44": 143,
57
+ "45": 397,
58
+ "46": 131,
59
+ "47": 218,
60
+ "48": 452,
61
+ "49": 478,
62
+ "50": 113,
63
+ "51": 141,
64
+ "52": 342,
65
+ "53": 364,
66
+ "54": 82,
67
+ "55": 4,
68
+ "56": 142,
69
+ "57": 33,
70
+ "58": 265,
71
+ "59": 318,
72
+ "60": 433,
73
+ "61": 29,
74
+ "62": 21,
75
+ "63": 383
76
+ }
77
+ }
L31A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 31,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 28.58884620666504,
17
+ "out": 18.207286834716797
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L31A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62402678ec4e8895918806cf95462f9edb8cbf9f31a8bc755f54bcd8ff8e6609
3
+ size 1614040466
L31A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0050583091,
3
+ "explained_variance": 0.83675,
4
+ "l1": 97.56,
5
+ "ground_truth_norm": 18.24,
6
+ "reconstructed_norm": 17.639,
7
+ "error_norm": 4.4254,
8
+ "sparsity/below 1e-5": 24917,
9
+ "sparsity/below 1e-6": 24728,
10
+ "positivity": 0.24741,
11
+ "ov_head_live_count": {
12
+ "0": 189,
13
+ "1": 3,
14
+ "2": 159,
15
+ "3": 82,
16
+ "4": 71,
17
+ "5": 139,
18
+ "6": 112,
19
+ "7": 353,
20
+ "8": 257,
21
+ "9": 168,
22
+ "10": 48,
23
+ "11": 46,
24
+ "12": 184,
25
+ "13": 92,
26
+ "14": 243,
27
+ "15": 2,
28
+ "16": 180,
29
+ "17": 233,
30
+ "18": 65,
31
+ "19": 127,
32
+ "20": 124,
33
+ "21": 105,
34
+ "22": 226,
35
+ "23": 6,
36
+ "24": 107,
37
+ "25": 131,
38
+ "26": 201,
39
+ "27": 114,
40
+ "28": 144,
41
+ "29": 108,
42
+ "30": 126,
43
+ "31": 74,
44
+ "32": 41,
45
+ "33": 219,
46
+ "34": 188,
47
+ "35": 23,
48
+ "36": 156,
49
+ "37": 215,
50
+ "38": 69,
51
+ "39": 134,
52
+ "40": 50,
53
+ "41": 19,
54
+ "42": 59,
55
+ "43": 133,
56
+ "44": 63,
57
+ "45": 111,
58
+ "46": 264,
59
+ "47": 65,
60
+ "48": 99,
61
+ "49": 181,
62
+ "50": 128,
63
+ "51": 239,
64
+ "52": 80,
65
+ "53": 139,
66
+ "54": 76,
67
+ "55": 190,
68
+ "56": 64,
69
+ "57": 91,
70
+ "58": 179,
71
+ "59": 34,
72
+ "60": 149,
73
+ "61": 143,
74
+ "62": 111,
75
+ "63": 176
76
+ }
77
+ }