fnlp
/

Hzfinfdu commited on
Commit
60a1fdf
·
verified ·
1 Parent(s): d281fdc

Upload folder using huggingface_hub

Browse files
L12A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 12,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 27.36377716064453,
17
+ "out": 2.912698745727539
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L12A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e94c9348a6c6e4382ce1f141a3be0fead61c0c81f073144900e0bac8692570be
3
+ size 1614040466
L12A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0006444874,
3
+ "explained_variance": 0.65362,
4
+ "l1": 16.72,
5
+ "ground_truth_norm": 2.933,
6
+ "reconstructed_norm": 2.46,
7
+ "error_norm": 1.5802,
8
+ "sparsity/below 1e-5": 24648,
9
+ "sparsity/below 1e-6": 24548,
10
+ "positivity": 0.2558,
11
+ "ov_head_live_count": {
12
+ "0": 113,
13
+ "1": 3,
14
+ "2": 28,
15
+ "3": 61,
16
+ "4": 361,
17
+ "5": 79,
18
+ "6": 180,
19
+ "7": 20,
20
+ "8": 215,
21
+ "9": 12,
22
+ "10": 97,
23
+ "11": 136,
24
+ "12": 146,
25
+ "13": 89,
26
+ "14": 13,
27
+ "15": 30,
28
+ "16": 5,
29
+ "17": 20,
30
+ "18": 183,
31
+ "19": 7,
32
+ "20": 191,
33
+ "21": 216,
34
+ "22": 2,
35
+ "23": 21,
36
+ "24": 166,
37
+ "25": 104,
38
+ "26": 334,
39
+ "27": 418,
40
+ "28": 368,
41
+ "29": 35,
42
+ "30": 306,
43
+ "31": 383,
44
+ "32": 166,
45
+ "33": 394,
46
+ "34": 129,
47
+ "35": 64,
48
+ "36": 418,
49
+ "37": 29,
50
+ "38": 10,
51
+ "39": 130,
52
+ "40": 122,
53
+ "41": 175,
54
+ "42": 8,
55
+ "43": 124,
56
+ "44": 15,
57
+ "45": 20,
58
+ "46": 13,
59
+ "47": 198,
60
+ "48": 308,
61
+ "49": 141,
62
+ "50": 27,
63
+ "51": 166,
64
+ "52": 85,
65
+ "53": 8,
66
+ "54": 247,
67
+ "55": 142,
68
+ "56": 31,
69
+ "57": 18,
70
+ "58": 47,
71
+ "59": 80,
72
+ "60": 277,
73
+ "61": 73,
74
+ "62": 157,
75
+ "63": 218
76
+ }
77
+ }
L13A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 13,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 31.526508331298828,
17
+ "out": 3.1382012367248535
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L13A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8bf691e58b635c9c5a20488926004b6c872f6a4a52693fb278b2f00f5f96a80
3
+ size 1614040466
L13A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0007835636,
3
+ "explained_variance": 0.68137,
4
+ "l1": 18.08,
5
+ "ground_truth_norm": 3.16,
6
+ "reconstructed_norm": 2.63,
7
+ "error_norm": 1.7362,
8
+ "sparsity/below 1e-5": 25078,
9
+ "sparsity/below 1e-6": 25001,
10
+ "positivity": 0.24667,
11
+ "ov_head_live_count": {
12
+ "0": 180,
13
+ "1": 68,
14
+ "2": 279,
15
+ "3": 133,
16
+ "4": 12,
17
+ "5": 123,
18
+ "6": 120,
19
+ "7": 55,
20
+ "8": 189,
21
+ "9": 117,
22
+ "10": 410,
23
+ "11": 41,
24
+ "12": 171,
25
+ "13": 12,
26
+ "14": 35,
27
+ "15": 255,
28
+ "16": 40,
29
+ "17": 3,
30
+ "18": 9,
31
+ "19": 38,
32
+ "20": 207,
33
+ "21": 354,
34
+ "22": 223,
35
+ "23": 450,
36
+ "24": 27,
37
+ "25": 31,
38
+ "26": 65,
39
+ "27": 21,
40
+ "28": 217,
41
+ "29": 63,
42
+ "30": 278,
43
+ "31": 25,
44
+ "32": 197,
45
+ "33": 368,
46
+ "34": 101,
47
+ "35": 64,
48
+ "36": 310,
49
+ "37": 32,
50
+ "38": 174,
51
+ "39": 286,
52
+ "40": 109,
53
+ "41": 101,
54
+ "42": 7,
55
+ "43": 147,
56
+ "44": 26,
57
+ "45": 38,
58
+ "46": 55,
59
+ "47": 7,
60
+ "48": 351,
61
+ "49": 93,
62
+ "50": 10,
63
+ "51": 14,
64
+ "52": 3,
65
+ "53": 179,
66
+ "54": 129,
67
+ "55": 182,
68
+ "56": 23,
69
+ "57": 65,
70
+ "58": 37,
71
+ "59": 180,
72
+ "60": 82,
73
+ "61": 117,
74
+ "62": 37,
75
+ "63": 308
76
+ }
77
+ }
L14A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 14,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 32.4897575378418,
17
+ "out": 3.191192865371704
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L14A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57feb9549ec962084f9978ff99b2c3ec6ec4c5271e6bd270a00280a5442ff483
3
+ size 1614040466
L14A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0007458544,
3
+ "explained_variance": 0.71227,
4
+ "l1": 20.76,
5
+ "ground_truth_norm": 3.223,
6
+ "reconstructed_norm": 2.733,
7
+ "error_norm": 1.6916,
8
+ "sparsity/below 1e-5": 26050,
9
+ "sparsity/below 1e-6": 26003,
10
+ "positivity": 0.20981,
11
+ "ov_head_live_count": {
12
+ "0": 17,
13
+ "1": 107,
14
+ "2": 53,
15
+ "3": 146,
16
+ "4": 3,
17
+ "5": 114,
18
+ "6": 19,
19
+ "7": 20,
20
+ "8": 92,
21
+ "9": 123,
22
+ "10": 178,
23
+ "11": 182,
24
+ "12": 230,
25
+ "13": 181,
26
+ "14": 45,
27
+ "15": 64,
28
+ "16": 310,
29
+ "17": 9,
30
+ "18": 15,
31
+ "19": 27,
32
+ "20": 154,
33
+ "21": 94,
34
+ "22": 117,
35
+ "23": 164,
36
+ "24": 189,
37
+ "25": 64,
38
+ "26": 190,
39
+ "27": 90,
40
+ "28": 191,
41
+ "29": 178,
42
+ "30": 47,
43
+ "31": 151,
44
+ "32": 5,
45
+ "33": 264,
46
+ "34": 82,
47
+ "35": 121,
48
+ "36": 86,
49
+ "37": 17,
50
+ "38": 55,
51
+ "39": 8,
52
+ "40": 410,
53
+ "41": 107,
54
+ "42": 51,
55
+ "43": 44,
56
+ "44": 233,
57
+ "45": 26,
58
+ "46": 98,
59
+ "47": 34,
60
+ "48": 95,
61
+ "49": 99,
62
+ "50": 162,
63
+ "51": 150,
64
+ "52": 298,
65
+ "53": 10,
66
+ "54": 85,
67
+ "55": 20,
68
+ "56": 69,
69
+ "57": 235,
70
+ "58": 38,
71
+ "59": 187,
72
+ "60": 18,
73
+ "61": 107,
74
+ "62": 66,
75
+ "63": 31
76
+ }
77
+ }
L15A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 15,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 30.77876853942871,
17
+ "out": 3.105498790740967
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L15A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82ba1571ed03bc6e7ae938b992bda3eba7a6bf0f93bf2502ff77c7d9ad326e11
3
+ size 1614040466
L15A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0007111974,
3
+ "explained_variance": 0.70865,
4
+ "l1": 18.94,
5
+ "ground_truth_norm": 3.159,
6
+ "reconstructed_norm": 2.682,
7
+ "error_norm": 1.6552,
8
+ "sparsity/below 1e-5": 22472,
9
+ "sparsity/below 1e-6": 22379,
10
+ "positivity": 0.32211,
11
+ "ov_head_live_count": {
12
+ "0": 302,
13
+ "1": 81,
14
+ "2": 361,
15
+ "3": 286,
16
+ "4": 184,
17
+ "5": 178,
18
+ "6": 258,
19
+ "7": 84,
20
+ "8": 88,
21
+ "9": 419,
22
+ "10": 113,
23
+ "11": 161,
24
+ "12": 365,
25
+ "13": 175,
26
+ "14": 327,
27
+ "15": 154,
28
+ "16": 355,
29
+ "17": 289,
30
+ "18": 244,
31
+ "19": 50,
32
+ "20": 93,
33
+ "21": 39,
34
+ "22": 144,
35
+ "23": 16,
36
+ "24": 194,
37
+ "25": 101,
38
+ "26": 122,
39
+ "27": 43,
40
+ "28": 35,
41
+ "29": 59,
42
+ "30": 123,
43
+ "31": 292,
44
+ "32": 281,
45
+ "33": 291,
46
+ "34": 107,
47
+ "35": 380,
48
+ "36": 284,
49
+ "37": 74,
50
+ "38": 108,
51
+ "39": 12,
52
+ "40": 212,
53
+ "41": 272,
54
+ "42": 52,
55
+ "43": 200,
56
+ "44": 71,
57
+ "45": 98,
58
+ "46": 72,
59
+ "47": 2,
60
+ "48": 198,
61
+ "49": 86,
62
+ "50": 208,
63
+ "51": 69,
64
+ "52": 145,
65
+ "53": 141,
66
+ "54": 10,
67
+ "55": 149,
68
+ "56": 82,
69
+ "57": 269,
70
+ "58": 126,
71
+ "59": 26,
72
+ "60": 339,
73
+ "61": 151,
74
+ "62": 121,
75
+ "63": 184
76
+ }
77
+ }
L16A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 16,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 31.657766342163086,
17
+ "out": 3.292898416519165
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L16A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ca2047752006f4fed44dc1272e1f8b9a4e12e6750cc1b2efae923ab53ad897e
3
+ size 1614040466
L16A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0006912273,
3
+ "explained_variance": 0.73999,
4
+ "l1": 20.49,
5
+ "ground_truth_norm": 3.339,
6
+ "reconstructed_norm": 2.904,
7
+ "error_norm": 1.6263,
8
+ "sparsity/below 1e-5": 22885,
9
+ "sparsity/below 1e-6": 22814,
10
+ "positivity": 0.3063,
11
+ "ov_head_live_count": {
12
+ "0": 216,
13
+ "1": 304,
14
+ "2": 7,
15
+ "3": 287,
16
+ "4": 379,
17
+ "5": 244,
18
+ "6": 95,
19
+ "7": 110,
20
+ "8": 121,
21
+ "9": 75,
22
+ "10": 3,
23
+ "11": 290,
24
+ "12": 72,
25
+ "13": 57,
26
+ "14": 104,
27
+ "15": 140,
28
+ "16": 29,
29
+ "17": 125,
30
+ "18": 257,
31
+ "19": 305,
32
+ "20": 94,
33
+ "21": 296,
34
+ "22": 154,
35
+ "23": 92,
36
+ "24": 179,
37
+ "25": 278,
38
+ "26": 70,
39
+ "27": 99,
40
+ "28": 176,
41
+ "29": 146,
42
+ "30": 56,
43
+ "31": 57,
44
+ "32": 6,
45
+ "33": 269,
46
+ "34": 194,
47
+ "35": 172,
48
+ "36": 170,
49
+ "37": 150,
50
+ "38": 3,
51
+ "39": 46,
52
+ "40": 83,
53
+ "41": 77,
54
+ "42": 107,
55
+ "43": 11,
56
+ "44": 2,
57
+ "45": 208,
58
+ "46": 352,
59
+ "47": 16,
60
+ "48": 364,
61
+ "49": 88,
62
+ "50": 247,
63
+ "51": 54,
64
+ "52": 141,
65
+ "53": 182,
66
+ "54": 358,
67
+ "55": 305,
68
+ "56": 38,
69
+ "57": 140,
70
+ "58": 186,
71
+ "59": 71,
72
+ "60": 319,
73
+ "61": 375,
74
+ "62": 90,
75
+ "63": 296
76
+ }
77
+ }
L17A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 17,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 31.074840545654297,
17
+ "out": 3.057485818862915
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L17A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60c31a6ebb759b91b11aa70f4b80cb7b3135f745783599134d3df77b72b64017
3
+ size 1614040466
L17A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0005549974,
3
+ "explained_variance": 0.75869,
4
+ "l1": 19.07,
5
+ "ground_truth_norm": 3.08,
6
+ "reconstructed_norm": 2.707,
7
+ "error_norm": 1.4556,
8
+ "sparsity/below 1e-5": 20583,
9
+ "sparsity/below 1e-6": 20264,
10
+ "positivity": 0.39352,
11
+ "ov_head_live_count": {
12
+ "0": 232,
13
+ "1": 170,
14
+ "2": 219,
15
+ "3": 309,
16
+ "4": 196,
17
+ "5": 345,
18
+ "6": 351,
19
+ "7": 127,
20
+ "8": 174,
21
+ "9": 63,
22
+ "10": 150,
23
+ "11": 287,
24
+ "12": 383,
25
+ "13": 268,
26
+ "14": 79,
27
+ "15": 64,
28
+ "16": 21,
29
+ "17": 94,
30
+ "18": 156,
31
+ "19": 148,
32
+ "20": 256,
33
+ "21": 136,
34
+ "22": 282,
35
+ "23": 73,
36
+ "24": 373,
37
+ "25": 354,
38
+ "26": 265,
39
+ "27": 85,
40
+ "28": 44,
41
+ "29": 237,
42
+ "30": 49,
43
+ "31": 71,
44
+ "32": 374,
45
+ "33": 249,
46
+ "34": 215,
47
+ "35": 44,
48
+ "36": 337,
49
+ "37": 118,
50
+ "38": 387,
51
+ "39": 306,
52
+ "40": 119,
53
+ "41": 361,
54
+ "42": 361,
55
+ "43": 285,
56
+ "44": 278,
57
+ "45": 130,
58
+ "46": 318,
59
+ "47": 135,
60
+ "48": 391,
61
+ "49": 172,
62
+ "50": 268,
63
+ "51": 61,
64
+ "52": 232,
65
+ "53": 88,
66
+ "54": 363,
67
+ "55": 79,
68
+ "56": 352,
69
+ "57": 57,
70
+ "58": 205,
71
+ "59": 115,
72
+ "60": 51,
73
+ "61": 70,
74
+ "62": 197,
75
+ "63": 116
76
+ }
77
+ }
L18A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 18,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 31.758373260498047,
17
+ "out": 2.5856893062591553
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L18A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5dc42a30ce6e391e61b142aafba75c0aa18895756149da495f7243341430f04
3
+ size 1614040466
L18A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0003417022,
3
+ "explained_variance": 0.78765,
4
+ "l1": 16.81,
5
+ "ground_truth_norm": 2.625,
6
+ "reconstructed_norm": 2.361,
7
+ "error_norm": 1.1308,
8
+ "sparsity/below 1e-5": 18686,
9
+ "sparsity/below 1e-6": 17927,
10
+ "positivity": 0.48187,
11
+ "ov_head_live_count": {
12
+ "0": 460,
13
+ "1": 349,
14
+ "2": 470,
15
+ "3": 253,
16
+ "4": 337,
17
+ "5": 343,
18
+ "6": 99,
19
+ "7": 414,
20
+ "8": 139,
21
+ "9": 172,
22
+ "10": 185,
23
+ "11": 43,
24
+ "12": 418,
25
+ "13": 387,
26
+ "14": 265,
27
+ "15": 419,
28
+ "16": 170,
29
+ "17": 48,
30
+ "18": 173,
31
+ "19": 255,
32
+ "20": 297,
33
+ "21": 196,
34
+ "22": 118,
35
+ "23": 145,
36
+ "24": 143,
37
+ "25": 402,
38
+ "26": 151,
39
+ "27": 451,
40
+ "28": 76,
41
+ "29": 345,
42
+ "30": 254,
43
+ "31": 208,
44
+ "32": 213,
45
+ "33": 164,
46
+ "34": 299,
47
+ "35": 163,
48
+ "36": 309,
49
+ "37": 311,
50
+ "38": 197,
51
+ "39": 227,
52
+ "40": 221,
53
+ "41": 315,
54
+ "42": 127,
55
+ "43": 389,
56
+ "44": 272,
57
+ "45": 160,
58
+ "46": 74,
59
+ "47": 54,
60
+ "48": 423,
61
+ "49": 127,
62
+ "50": 317,
63
+ "51": 59,
64
+ "52": 395,
65
+ "53": 261,
66
+ "54": 406,
67
+ "55": 306,
68
+ "56": 165,
69
+ "57": 276,
70
+ "58": 96,
71
+ "59": 347,
72
+ "60": 170,
73
+ "61": 339,
74
+ "62": 394,
75
+ "63": 29
76
+ }
77
+ }
L19A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 19,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 31.919294357299805,
17
+ "out": 2.284219980239868
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L19A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1c138b05a313c4085b599ba89e1fd445b66f23ff0ec12023139180cecb7a3ce
3
+ size 1614040466
L19A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0003078198,
3
+ "explained_variance": 0.76338,
4
+ "l1": 15.76,
5
+ "ground_truth_norm": 2.332,
6
+ "reconstructed_norm": 2.067,
7
+ "error_norm": 1.0594,
8
+ "sparsity/below 1e-5": 16110,
9
+ "sparsity/below 1e-6": 14863,
10
+ "positivity": 0.56973,
11
+ "ov_head_live_count": {
12
+ "0": 409,
13
+ "1": 263,
14
+ "2": 416,
15
+ "3": 195,
16
+ "4": 357,
17
+ "5": 319,
18
+ "6": 162,
19
+ "7": 444,
20
+ "8": 244,
21
+ "9": 109,
22
+ "10": 219,
23
+ "11": 417,
24
+ "12": 278,
25
+ "13": 299,
26
+ "14": 350,
27
+ "15": 420,
28
+ "16": 209,
29
+ "17": 39,
30
+ "18": 362,
31
+ "19": 331,
32
+ "20": 204,
33
+ "21": 195,
34
+ "22": 415,
35
+ "23": 382,
36
+ "24": 350,
37
+ "25": 106,
38
+ "26": 342,
39
+ "27": 201,
40
+ "28": 334,
41
+ "29": 324,
42
+ "30": 390,
43
+ "31": 385,
44
+ "32": 412,
45
+ "33": 373,
46
+ "34": 449,
47
+ "35": 224,
48
+ "36": 296,
49
+ "37": 345,
50
+ "38": 86,
51
+ "39": 422,
52
+ "40": 327,
53
+ "41": 67,
54
+ "42": 447,
55
+ "43": 159,
56
+ "44": 278,
57
+ "45": 251,
58
+ "46": 303,
59
+ "47": 357,
60
+ "48": 147,
61
+ "49": 48,
62
+ "50": 301,
63
+ "51": 154,
64
+ "52": 177,
65
+ "53": 252,
66
+ "54": 435,
67
+ "55": 173,
68
+ "56": 445,
69
+ "57": 379,
70
+ "58": 175,
71
+ "59": 359,
72
+ "60": 399,
73
+ "61": 364,
74
+ "62": 251,
75
+ "63": 344
76
+ }
77
+ }
L20A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 20,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 32.97787094116211,
17
+ "out": 2.3633012771606445
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L20A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63b1bc4cde7614df26b9cadb4d6cb29796862ee6167c444359d1f6cc4f6f98b2
3
+ size 1614040466
L20A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0003548504,
3
+ "explained_variance": 0.75766,
4
+ "l1": 16.56,
5
+ "ground_truth_norm": 2.379,
6
+ "reconstructed_norm": 2.083,
7
+ "error_norm": 1.1285,
8
+ "sparsity/below 1e-5": 17100,
9
+ "sparsity/below 1e-6": 16072,
10
+ "positivity": 0.5462,
11
+ "ov_head_live_count": {
12
+ "0": 419,
13
+ "1": 199,
14
+ "2": 234,
15
+ "3": 327,
16
+ "4": 80,
17
+ "5": 479,
18
+ "6": 245,
19
+ "7": 362,
20
+ "8": 93,
21
+ "9": 449,
22
+ "10": 274,
23
+ "11": 78,
24
+ "12": 298,
25
+ "13": 315,
26
+ "14": 406,
27
+ "15": 257,
28
+ "16": 353,
29
+ "17": 162,
30
+ "18": 330,
31
+ "19": 359,
32
+ "20": 139,
33
+ "21": 102,
34
+ "22": 371,
35
+ "23": 330,
36
+ "24": 444,
37
+ "25": 309,
38
+ "26": 276,
39
+ "27": 160,
40
+ "28": 208,
41
+ "29": 345,
42
+ "30": 39,
43
+ "31": 93,
44
+ "32": 264,
45
+ "33": 58,
46
+ "34": 221,
47
+ "35": 229,
48
+ "36": 411,
49
+ "37": 212,
50
+ "38": 323,
51
+ "39": 335,
52
+ "40": 282,
53
+ "41": 327,
54
+ "42": 430,
55
+ "43": 308,
56
+ "44": 298,
57
+ "45": 420,
58
+ "46": 420,
59
+ "47": 279,
60
+ "48": 179,
61
+ "49": 193,
62
+ "50": 146,
63
+ "51": 374,
64
+ "52": 449,
65
+ "53": 423,
66
+ "54": 261,
67
+ "55": 311,
68
+ "56": 462,
69
+ "57": 118,
70
+ "58": 398,
71
+ "59": 107,
72
+ "60": 147,
73
+ "61": 341,
74
+ "62": 273,
75
+ "63": 364
76
+ }
77
+ }
L21A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 21,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 32.402626037597656,
17
+ "out": 3.294581651687622
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L21A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:551b4bd4081bce1b00c2cf43eeaefb76498251abd586663728edcbb31e78c91a
3
+ size 1614040466
L21A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0005338052,
3
+ "explained_variance": 0.80594,
4
+ "l1": 21.32,
5
+ "ground_truth_norm": 3.356,
6
+ "reconstructed_norm": 3.039,
7
+ "error_norm": 1.4049,
8
+ "sparsity/below 1e-5": 19748,
9
+ "sparsity/below 1e-6": 19430,
10
+ "positivity": 0.44174,
11
+ "ov_head_live_count": {
12
+ "0": 85,
13
+ "1": 155,
14
+ "2": 197,
15
+ "3": 284,
16
+ "4": 319,
17
+ "5": 171,
18
+ "6": 218,
19
+ "7": 375,
20
+ "8": 86,
21
+ "9": 91,
22
+ "10": 352,
23
+ "11": 186,
24
+ "12": 356,
25
+ "13": 116,
26
+ "14": 289,
27
+ "15": 77,
28
+ "16": 220,
29
+ "17": 409,
30
+ "18": 84,
31
+ "19": 106,
32
+ "20": 89,
33
+ "21": 345,
34
+ "22": 311,
35
+ "23": 271,
36
+ "24": 181,
37
+ "25": 138,
38
+ "26": 282,
39
+ "27": 72,
40
+ "28": 124,
41
+ "29": 292,
42
+ "30": 199,
43
+ "31": 219,
44
+ "32": 230,
45
+ "33": 137,
46
+ "34": 362,
47
+ "35": 258,
48
+ "36": 226,
49
+ "37": 260,
50
+ "38": 287,
51
+ "39": 199,
52
+ "40": 315,
53
+ "41": 109,
54
+ "42": 311,
55
+ "43": 303,
56
+ "44": 281,
57
+ "45": 330,
58
+ "46": 390,
59
+ "47": 175,
60
+ "48": 307,
61
+ "49": 224,
62
+ "50": 292,
63
+ "51": 222,
64
+ "52": 446,
65
+ "53": 276,
66
+ "54": 262,
67
+ "55": 57,
68
+ "56": 209,
69
+ "57": 90,
70
+ "58": 151,
71
+ "59": 216,
72
+ "60": 184,
73
+ "61": 87,
74
+ "62": 362,
75
+ "63": 218
76
+ }
77
+ }
L22A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 22,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 34.090354919433594,
17
+ "out": 2.561861753463745
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L22A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:632b82ac74d6c1a42d3da0b866e87d4ce2efa80bfde820f5144ff7d86084a830
3
+ size 1614040466
L22A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0004232735,
3
+ "explained_variance": 0.76295,
4
+ "l1": 17.34,
5
+ "ground_truth_norm": 2.654,
6
+ "reconstructed_norm": 2.344,
7
+ "error_norm": 1.2268,
8
+ "sparsity/below 1e-5": 16999,
9
+ "sparsity/below 1e-6": 16475,
10
+ "positivity": 0.52084,
11
+ "ov_head_live_count": {
12
+ "0": 184,
13
+ "1": 358,
14
+ "2": 169,
15
+ "3": 436,
16
+ "4": 375,
17
+ "5": 156,
18
+ "6": 52,
19
+ "7": 228,
20
+ "8": 286,
21
+ "9": 27,
22
+ "10": 274,
23
+ "11": 254,
24
+ "12": 38,
25
+ "13": 395,
26
+ "14": 154,
27
+ "15": 453,
28
+ "16": 430,
29
+ "17": 426,
30
+ "18": 321,
31
+ "19": 396,
32
+ "20": 119,
33
+ "21": 307,
34
+ "22": 284,
35
+ "23": 277,
36
+ "24": 66,
37
+ "25": 261,
38
+ "26": 156,
39
+ "27": 447,
40
+ "28": 368,
41
+ "29": 329,
42
+ "30": 400,
43
+ "31": 360,
44
+ "32": 370,
45
+ "33": 363,
46
+ "34": 91,
47
+ "35": 264,
48
+ "36": 145,
49
+ "37": 247,
50
+ "38": 38,
51
+ "39": 124,
52
+ "40": 416,
53
+ "41": 252,
54
+ "42": 422,
55
+ "43": 274,
56
+ "44": 261,
57
+ "45": 84,
58
+ "46": 260,
59
+ "47": 295,
60
+ "48": 268,
61
+ "49": 324,
62
+ "50": 161,
63
+ "51": 176,
64
+ "52": 86,
65
+ "53": 219,
66
+ "54": 369,
67
+ "55": 285,
68
+ "56": 250,
69
+ "57": 395,
70
+ "58": 382,
71
+ "59": 273,
72
+ "60": 206,
73
+ "61": 388,
74
+ "62": 418,
75
+ "63": 175
76
+ }
77
+ }
L23A/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "d_qk_head": 256,
3
+ "d_ov_head": 1,
4
+ "n_qk_heads": 64,
5
+ "n_ov_heads": 32768,
6
+ "device": "cuda",
7
+ "dtype": "torch.float",
8
+ "virtual_kv_num": 0,
9
+ "use_z_relu": true,
10
+ "n_ctx": 1024,
11
+ "layer": 23,
12
+ "model_name": "meta-llama/Llama-3.1-8B",
13
+ "mode": "top_k",
14
+ "top_k": 128,
15
+ "avg_norm": {
16
+ "in": 33.478187561035156,
17
+ "out": 2.9890081882476807
18
+ },
19
+ "d_model": 4096,
20
+ "attn_scale": 11.313708498984761,
21
+ "positional_embedding_type": "rotary",
22
+ "rotary_scale": 1,
23
+ "rotary_dim": 256,
24
+ "rotary_base": 500000.0,
25
+ "rotary_adjacent_pairs": false,
26
+ "use_NTK_by_parts_rope": true,
27
+ "NTK_by_parts_low_freq_factor": 1.0,
28
+ "NTK_by_parts_high_freq_factor": 4.0,
29
+ "NTK_by_parts_factor": 8.0,
30
+ "old_context_len": 8192
31
+ }
L23A/final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6955da6d7a1e9375d0b67241d9f99f1143f752d1aa25ed4de710927f0746dacc
3
+ size 1614040466
L23A/metrics.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse_loss": 0.0004005697,
3
+ "explained_variance": 0.80796,
4
+ "l1": 18.76,
5
+ "ground_truth_norm": 2.916,
6
+ "reconstructed_norm": 2.651,
7
+ "error_norm": 1.1865,
8
+ "sparsity/below 1e-5": 15034,
9
+ "sparsity/below 1e-6": 14151,
10
+ "positivity": 0.64301,
11
+ "ov_head_live_count": {
12
+ "0": 292,
13
+ "1": 314,
14
+ "2": 155,
15
+ "3": 317,
16
+ "4": 338,
17
+ "5": 308,
18
+ "6": 276,
19
+ "7": 123,
20
+ "8": 436,
21
+ "9": 274,
22
+ "10": 448,
23
+ "11": 430,
24
+ "12": 355,
25
+ "13": 341,
26
+ "14": 112,
27
+ "15": 192,
28
+ "16": 180,
29
+ "17": 395,
30
+ "18": 169,
31
+ "19": 414,
32
+ "20": 395,
33
+ "21": 404,
34
+ "22": 309,
35
+ "23": 490,
36
+ "24": 437,
37
+ "25": 419,
38
+ "26": 411,
39
+ "27": 131,
40
+ "28": 267,
41
+ "29": 364,
42
+ "30": 119,
43
+ "31": 471,
44
+ "32": 351,
45
+ "33": 375,
46
+ "34": 169,
47
+ "35": 448,
48
+ "36": 493,
49
+ "37": 270,
50
+ "38": 176,
51
+ "39": 339,
52
+ "40": 182,
53
+ "41": 336,
54
+ "42": 357,
55
+ "43": 367,
56
+ "44": 478,
57
+ "45": 335,
58
+ "46": 407,
59
+ "47": 498,
60
+ "48": 97,
61
+ "49": 433,
62
+ "50": 414,
63
+ "51": 423,
64
+ "52": 342,
65
+ "53": 111,
66
+ "54": 389,
67
+ "55": 344,
68
+ "56": 269,
69
+ "57": 379,
70
+ "58": 273,
71
+ "59": 358,
72
+ "60": 411,
73
+ "61": 443,
74
+ "62": 229,
75
+ "63": 488
76
+ }
77
+ }