fnlp
/

Hzfinfdu commited on
Commit
d281fdc
·
verified ·
1 Parent(s): c14c1dc

Upload folder using huggingface_hub

Browse files
L10A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 10,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 30.592103958129883,
17
+ "out": 2.438781261444092
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L10A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:094eee9c4592e95407e9f48b1069bf6c721dc6c22979423ec9d1700099912dc1
3
+ size 1614040466
L10A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0004476196,
3
+ "explained_variance": 0.68912,
4
+ "l1": 13.8,
5
+ "ground_truth_norm": 2.459,
6
+ "reconstructed_norm": 2.072,
7
+ "error_norm": 1.3116,
8
+ "sparsity/below 1e-5": 21759,
9
+ "sparsity/below 1e-6": 21495,
10
+ "positivity": 0.37479,
11
+ "ov_head_live_count": {
12
+ "0": 49,
13
+ "1": 145,
14
+ "2": 259,
15
+ "3": 316,
16
+ "4": 384,
17
+ "5": 110,
18
+ "6": 168,
19
+ "7": 8,
20
+ "8": 391,
21
+ "9": 154,
22
+ "10": 264,
23
+ "11": 40,
24
+ "12": 263,
25
+ "13": 425,
26
+ "14": 173,
27
+ "15": 123,
28
+ "16": 143,
29
+ "17": 27,
30
+ "18": 324,
31
+ "19": 91,
32
+ "20": 100,
33
+ "21": 407,
34
+ "22": 269,
35
+ "23": 327,
36
+ "24": 190,
37
+ "25": 135,
38
+ "26": 113,
39
+ "27": 233,
40
+ "28": 202,
41
+ "29": 99,
42
+ "30": 95,
43
+ "31": 95,
44
+ "32": 245,
45
+ "33": 207,
46
+ "34": 240,
47
+ "35": 176,
48
+ "36": 39,
49
+ "37": 248,
50
+ "38": 266,
51
+ "39": 166,
52
+ "40": 352,
53
+ "41": 161,
54
+ "42": 184,
55
+ "43": 30,
56
+ "44": 171,
57
+ "45": 235,
58
+ "46": 263,
59
+ "47": 293,
60
+ "48": 150,
61
+ "49": 182,
62
+ "50": 168,
63
+ "51": 78,
64
+ "52": 334,
65
+ "53": 152,
66
+ "54": 29,
67
+ "55": 268,
68
+ "56": 255,
69
+ "57": 115,
70
+ "58": 203,
71
+ "59": 83,
72
+ "60": 312,
73
+ "61": 200,
74
+ "62": 124,
75
+ "63": 230
76
+ }
77
+ }
L11A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 11,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 30.33755874633789,
17
+ "out": 2.5108070373535156
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L11A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60ea504a3ae409da4785d240b6ecae94556544cb11117e0c4ae8eb04757b5b16
3
+ size 1614040466
L11A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0004975098,
3
+ "explained_variance": 0.67261,
4
+ "l1": 14.56,
5
+ "ground_truth_norm": 2.54,
6
+ "reconstructed_norm": 2.119,
7
+ "error_norm": 1.3917,
8
+ "sparsity/below 1e-5": 23567,
9
+ "sparsity/below 1e-6": 23293,
10
+ "positivity": 0.31964,
11
+ "ov_head_live_count": {
12
+ "0": 54,
13
+ "1": 218,
14
+ "2": 32,
15
+ "3": 116,
16
+ "4": 226,
17
+ "5": 107,
18
+ "6": 113,
19
+ "7": 66,
20
+ "8": 117,
21
+ "9": 126,
22
+ "10": 218,
23
+ "11": 220,
24
+ "12": 138,
25
+ "13": 91,
26
+ "14": 369,
27
+ "15": 360,
28
+ "16": 102,
29
+ "17": 109,
30
+ "18": 187,
31
+ "19": 299,
32
+ "20": 81,
33
+ "21": 326,
34
+ "22": 207,
35
+ "23": 195,
36
+ "24": 177,
37
+ "25": 22,
38
+ "26": 277,
39
+ "27": 41,
40
+ "28": 132,
41
+ "29": 80,
42
+ "30": 224,
43
+ "31": 370,
44
+ "32": 88,
45
+ "33": 103,
46
+ "34": 172,
47
+ "35": 109,
48
+ "36": 311,
49
+ "37": 48,
50
+ "38": 173,
51
+ "39": 43,
52
+ "40": 212,
53
+ "41": 41,
54
+ "42": 271,
55
+ "43": 69,
56
+ "44": 371,
57
+ "45": 61,
58
+ "46": 221,
59
+ "47": 152,
60
+ "48": 54,
61
+ "49": 50,
62
+ "50": 160,
63
+ "51": 178,
64
+ "52": 298,
65
+ "53": 289,
66
+ "54": 371,
67
+ "55": 44,
68
+ "56": 29,
69
+ "57": 151,
70
+ "58": 73,
71
+ "59": 101,
72
+ "60": 54,
73
+ "61": 191,
74
+ "62": 165,
75
+ "63": 421
76
+ }
77
+ }
L5A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 5,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 26.44734764099121,
17
+ "out": 1.6114155054092407
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L5A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03bc0ab9a7d5b6729116001d53958d2cd20264e979fc505f9ef9b0ca997997c9
3
+ size 1614040466
L5A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0001057259,
3
+ "explained_variance": 0.80751,
4
+ "l1": 9.0,
5
+ "ground_truth_norm": 1.624,
6
+ "reconstructed_norm": 1.493,
7
+ "error_norm": 0.6327,
8
+ "sparsity/below 1e-5": 21411,
9
+ "sparsity/below 1e-6": 21180,
10
+ "positivity": 0.36496,
11
+ "ov_head_live_count": {
12
+ "0": 122,
13
+ "1": 320,
14
+ "2": 352,
15
+ "3": 106,
16
+ "4": 241,
17
+ "5": 361,
18
+ "6": 204,
19
+ "7": 160,
20
+ "8": 68,
21
+ "9": 70,
22
+ "10": 225,
23
+ "11": 66,
24
+ "12": 109,
25
+ "13": 191,
26
+ "14": 337,
27
+ "15": 305,
28
+ "16": 251,
29
+ "17": 59,
30
+ "18": 287,
31
+ "19": 268,
32
+ "20": 111,
33
+ "21": 43,
34
+ "22": 97,
35
+ "23": 46,
36
+ "24": 244,
37
+ "25": 147,
38
+ "26": 47,
39
+ "27": 70,
40
+ "28": 50,
41
+ "29": 39,
42
+ "30": 285,
43
+ "31": 138,
44
+ "32": 292,
45
+ "33": 335,
46
+ "34": 353,
47
+ "35": 46,
48
+ "36": 64,
49
+ "37": 188,
50
+ "38": 26,
51
+ "39": 271,
52
+ "40": 81,
53
+ "41": 171,
54
+ "42": 358,
55
+ "43": 313,
56
+ "44": 182,
57
+ "45": 363,
58
+ "46": 316,
59
+ "47": 138,
60
+ "48": 216,
61
+ "49": 98,
62
+ "50": 177,
63
+ "51": 40,
64
+ "52": 308,
65
+ "53": 246,
66
+ "54": 152,
67
+ "55": 83,
68
+ "56": 306,
69
+ "57": 239,
70
+ "58": 44,
71
+ "59": 343,
72
+ "60": 28,
73
+ "61": 189,
74
+ "62": 297,
75
+ "63": 277
76
+ }
77
+ }
L6A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 6,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 25.502641677856445,
17
+ "out": 1.9176710844039917
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L6A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:621f8ec5dbb7dbd634184b176cabb3dcecf8292ac1d78036c549c45f471deca7
3
+ size 1614040466
L6A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0001995308,
3
+ "explained_variance": 0.75204,
4
+ "l1": 10.84,
5
+ "ground_truth_norm": 1.939,
6
+ "reconstructed_norm": 1.726,
7
+ "error_norm": 0.8747,
8
+ "sparsity/below 1e-5": 20505,
9
+ "sparsity/below 1e-6": 20331,
10
+ "positivity": 0.38525,
11
+ "ov_head_live_count": {
12
+ "0": 181,
13
+ "1": 67,
14
+ "2": 66,
15
+ "3": 122,
16
+ "4": 144,
17
+ "5": 143,
18
+ "6": 396,
19
+ "7": 241,
20
+ "8": 379,
21
+ "9": 287,
22
+ "10": 177,
23
+ "11": 247,
24
+ "12": 112,
25
+ "13": 163,
26
+ "14": 190,
27
+ "15": 199,
28
+ "16": 252,
29
+ "17": 357,
30
+ "18": 140,
31
+ "19": 272,
32
+ "20": 202,
33
+ "21": 146,
34
+ "22": 121,
35
+ "23": 24,
36
+ "24": 301,
37
+ "25": 187,
38
+ "26": 194,
39
+ "27": 281,
40
+ "28": 40,
41
+ "29": 48,
42
+ "30": 197,
43
+ "31": 84,
44
+ "32": 379,
45
+ "33": 174,
46
+ "34": 73,
47
+ "35": 190,
48
+ "36": 225,
49
+ "37": 395,
50
+ "38": 289,
51
+ "39": 299,
52
+ "40": 339,
53
+ "41": 112,
54
+ "42": 67,
55
+ "43": 350,
56
+ "44": 219,
57
+ "45": 98,
58
+ "46": 202,
59
+ "47": 149,
60
+ "48": 176,
61
+ "49": 324,
62
+ "50": 330,
63
+ "51": 338,
64
+ "52": 301,
65
+ "53": 41,
66
+ "54": 121,
67
+ "55": 205,
68
+ "56": 53,
69
+ "57": 190,
70
+ "58": 115,
71
+ "59": 195,
72
+ "60": 142,
73
+ "61": 86,
74
+ "62": 257,
75
+ "63": 230
76
+ }
77
+ }
L7A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 7,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 25.808958053588867,
17
+ "out": 2.3595995903015137
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L7A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6de3fcf7114df285a40bf94da6ccba859b968950ec4872e641c53322adc8c50b
3
+ size 1614040466
L7A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0003076273,
3
+ "explained_variance": 0.75009,
4
+ "l1": 13.16,
5
+ "ground_truth_norm": 2.385,
6
+ "reconstructed_norm": 2.113,
7
+ "error_norm": 1.0933,
8
+ "sparsity/below 1e-5": 21840,
9
+ "sparsity/below 1e-6": 21729,
10
+ "positivity": 0.34482,
11
+ "ov_head_live_count": {
12
+ "0": 52,
13
+ "1": 259,
14
+ "2": 94,
15
+ "3": 135,
16
+ "4": 109,
17
+ "5": 135,
18
+ "6": 198,
19
+ "7": 257,
20
+ "8": 121,
21
+ "9": 185,
22
+ "10": 377,
23
+ "11": 57,
24
+ "12": 218,
25
+ "13": 168,
26
+ "14": 228,
27
+ "15": 128,
28
+ "16": 216,
29
+ "17": 98,
30
+ "18": 232,
31
+ "19": 132,
32
+ "20": 324,
33
+ "21": 206,
34
+ "22": 194,
35
+ "23": 66,
36
+ "24": 275,
37
+ "25": 158,
38
+ "26": 160,
39
+ "27": 238,
40
+ "28": 238,
41
+ "29": 60,
42
+ "30": 167,
43
+ "31": 215,
44
+ "32": 291,
45
+ "33": 337,
46
+ "34": 124,
47
+ "35": 84,
48
+ "36": 140,
49
+ "37": 230,
50
+ "38": 100,
51
+ "39": 140,
52
+ "40": 262,
53
+ "41": 368,
54
+ "42": 92,
55
+ "43": 87,
56
+ "44": 287,
57
+ "45": 202,
58
+ "46": 182,
59
+ "47": 94,
60
+ "48": 362,
61
+ "49": 143,
62
+ "50": 156,
63
+ "51": 44,
64
+ "52": 97,
65
+ "53": 137,
66
+ "54": 168,
67
+ "55": 110,
68
+ "56": 75,
69
+ "57": 151,
70
+ "58": 41,
71
+ "59": 271,
72
+ "60": 160,
73
+ "61": 208,
74
+ "62": 112,
75
+ "63": 344
76
+ }
77
+ }
L8A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 8,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 28.46880340576172,
17
+ "out": 2.3348097801208496
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L8A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68d209ff696f96aea135ee110e147f143d4ceadb21040d1733f90cdb23fc13ab
3
+ size 1614040466
L8A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0003913084,
3
+ "explained_variance": 0.69221,
4
+ "l1": 13.56,
5
+ "ground_truth_norm": 2.347,
6
+ "reconstructed_norm": 1.988,
7
+ "error_norm": 1.2371,
8
+ "sparsity/below 1e-5": 23158,
9
+ "sparsity/below 1e-6": 23050,
10
+ "positivity": 0.30444,
11
+ "ov_head_live_count": {
12
+ "0": 284,
13
+ "1": 136,
14
+ "2": 368,
15
+ "3": 27,
16
+ "4": 14,
17
+ "5": 211,
18
+ "6": 358,
19
+ "7": 97,
20
+ "8": 10,
21
+ "9": 250,
22
+ "10": 448,
23
+ "11": 145,
24
+ "12": 4,
25
+ "13": 21,
26
+ "14": 104,
27
+ "15": 44,
28
+ "16": 43,
29
+ "17": 104,
30
+ "18": 304,
31
+ "19": 207,
32
+ "20": 74,
33
+ "21": 51,
34
+ "22": 162,
35
+ "23": 33,
36
+ "24": 107,
37
+ "25": 52,
38
+ "26": 322,
39
+ "27": 148,
40
+ "28": 116,
41
+ "29": 248,
42
+ "30": 323,
43
+ "31": 203,
44
+ "32": 220,
45
+ "33": 146,
46
+ "34": 273,
47
+ "35": 357,
48
+ "36": 17,
49
+ "37": 312,
50
+ "38": 24,
51
+ "39": 104,
52
+ "40": 126,
53
+ "41": 11,
54
+ "42": 53,
55
+ "43": 27,
56
+ "44": 42,
57
+ "45": 214,
58
+ "46": 240,
59
+ "47": 110,
60
+ "48": 226,
61
+ "49": 156,
62
+ "50": 256,
63
+ "51": 234,
64
+ "52": 269,
65
+ "53": 40,
66
+ "54": 113,
67
+ "55": 41,
68
+ "56": 52,
69
+ "57": 112,
70
+ "58": 112,
71
+ "59": 319,
72
+ "60": 197,
73
+ "61": 299,
74
+ "62": 156,
75
+ "63": 100
76
+ }
77
+ }
L9A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 9,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 28.447616577148438,
17
+ "out": 2.536945104598999
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L9A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6780ce060cdb2d3c2b35e90009c10fcf451d009b0f2cb7cf98dae5dbf92cfedc
3
+ size 1614040466
L9A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0004761537,
3
+ "explained_variance": 0.69271,
4
+ "l1": 14.97,
5
+ "ground_truth_norm": 2.561,
6
+ "reconstructed_norm": 2.163,
7
+ "error_norm": 1.3627,
8
+ "sparsity/below 1e-5": 23308,
9
+ "sparsity/below 1e-6": 23088,
10
+ "positivity": 0.31,
11
+ "ov_head_live_count": {
12
+ "0": 44,
13
+ "1": 102,
14
+ "2": 224,
15
+ "3": 101,
16
+ "4": 4,
17
+ "5": 348,
18
+ "6": 180,
19
+ "7": 21,
20
+ "8": 94,
21
+ "9": 57,
22
+ "10": 8,
23
+ "11": 295,
24
+ "12": 114,
25
+ "13": 187,
26
+ "14": 408,
27
+ "15": 79,
28
+ "16": 453,
29
+ "17": 95,
30
+ "18": 17,
31
+ "19": 198,
32
+ "20": 178,
33
+ "21": 115,
34
+ "22": 20,
35
+ "23": 73,
36
+ "24": 154,
37
+ "25": 192,
38
+ "26": 83,
39
+ "27": 174,
40
+ "28": 148,
41
+ "29": 214,
42
+ "30": 159,
43
+ "31": 222,
44
+ "32": 48,
45
+ "33": 193,
46
+ "34": 231,
47
+ "35": 418,
48
+ "36": 48,
49
+ "37": 51,
50
+ "38": 300,
51
+ "39": 426,
52
+ "40": 125,
53
+ "41": 193,
54
+ "42": 133,
55
+ "43": 252,
56
+ "44": 114,
57
+ "45": 198,
58
+ "46": 207,
59
+ "47": 302,
60
+ "48": 144,
61
+ "49": 38,
62
+ "50": 36,
63
+ "51": 87,
64
+ "52": 394,
65
+ "53": 121,
66
+ "54": 322,
67
+ "55": 14,
68
+ "56": 3,
69
+ "57": 147,
70
+ "58": 299,
71
+ "59": 110,
72
+ "60": 176,
73
+ "61": 30,
74
+ "62": 10,
75
+ "63": 227
76
+ }
77
+ }