Upload folder using huggingface_hub
Browse files- L0/config.json +31 -0
- L0/final.pth +3 -0
- L0/metrics.json +11 -0
- L1/config.json +31 -0
- L1/final.pth +3 -0
- L1/metrics.json +11 -0
- L10/config.json +31 -0
- L10/final.pth +3 -0
- L10/metrics.json +11 -0
- L11/config.json +31 -0
- L11/final.pth +3 -0
- L11/metrics.json +11 -0
- L2/config.json +31 -0
- L2/final.pth +3 -0
- L2/metrics.json +11 -0
- L3/config.json +31 -0
- L3/final.pth +3 -0
- L3/metrics.json +11 -0
- L4/config.json +31 -0
- L4/final.pth +3 -0
- L4/metrics.json +11 -0
- L5/config.json +31 -0
- L5/final.pth +3 -0
- L5/metrics.json +11 -0
- L6/config.json +31 -0
- L6/final.pth +3 -0
- L6/metrics.json +11 -0
- L7/config.json +31 -0
- L7/final.pth +3 -0
- L7/metrics.json +11 -0
- L8/config.json +31 -0
- L8/final.pth +3 -0
- L8/metrics.json +11 -0
- L9/config.json +31 -0
- L9/final.pth +3 -0
- L9/metrics.json +11 -0
L0/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 0,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 23.102449417114258,
|
17 |
+
"out": 3.8546135425567627
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L0/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2851325a82fcf4797fb18ac0d279d05ca10b619d5920a9edb2f4dc98a3355db
|
3 |
+
size 57006482
|
L0/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.00106,
|
3 |
+
"explained_variance": 0.93259,
|
4 |
+
"l1": 15.9,
|
5 |
+
"ground_truth_norm": 3.862,
|
6 |
+
"reconstructed_norm": 3.764,
|
7 |
+
"error_norm": 0.801,
|
8 |
+
"sparsity/below 1e-5": 3331,
|
9 |
+
"sparsity/below 1e-6": 3316,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L1/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 1,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 27.244165420532227,
|
17 |
+
"out": 3.7663726806640625
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L1/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dcd30b078958cf045da8d155e337a953dd97ed6ee760b86928dec33d623bd1b6
|
3 |
+
size 57006482
|
L1/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0011,
|
3 |
+
"explained_variance": 0.92292,
|
4 |
+
"l1": 16.14,
|
5 |
+
"ground_truth_norm": 3.762,
|
6 |
+
"reconstructed_norm": 3.65,
|
7 |
+
"error_norm": 0.8645,
|
8 |
+
"sparsity/below 1e-5": 2975,
|
9 |
+
"sparsity/below 1e-6": 2961,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L10/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 10,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 144.09291076660156,
|
17 |
+
"out": 3.9614734649658203
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L10/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:54b1c9c7007f81d230ba61c52810ec8b3232dc5fe33a7e2171d1297edf82342a
|
3 |
+
size 57006482
|
L10/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.00277,
|
3 |
+
"explained_variance": 0.8377,
|
4 |
+
"l1": 16.73,
|
5 |
+
"ground_truth_norm": 3.964,
|
6 |
+
"reconstructed_norm": 3.734,
|
7 |
+
"error_norm": 1.2477,
|
8 |
+
"sparsity/below 1e-5": 3543,
|
9 |
+
"sparsity/below 1e-6": 3510,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L11/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 11,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 150.25917053222656,
|
17 |
+
"out": 20.201923370361328
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L11/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:086852435fe7be016564297809b2083e7e2e036641fcf2dc4e7926e6aaa7b19c
|
3 |
+
size 57006482
|
L11/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.00404,
|
3 |
+
"explained_variance": 0.77293,
|
4 |
+
"l1": 18.28,
|
5 |
+
"ground_truth_norm": 20.238,
|
6 |
+
"reconstructed_norm": 20.157,
|
7 |
+
"error_norm": 1.5937,
|
8 |
+
"sparsity/below 1e-5": 4465,
|
9 |
+
"sparsity/below 1e-6": 4406,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L2/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 2,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 27.967687606811523,
|
17 |
+
"out": 4.815014839172363
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L2/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7c93793b9010c647284edb251923afc56ffae582effc72edf1ea0e1c2334d985
|
3 |
+
size 57006482
|
L2/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.00293,
|
3 |
+
"explained_variance": 0.88528,
|
4 |
+
"l1": 21.08,
|
5 |
+
"ground_truth_norm": 4.764,
|
6 |
+
"reconstructed_norm": 4.533,
|
7 |
+
"error_norm": 1.4217,
|
8 |
+
"sparsity/below 1e-5": 2958,
|
9 |
+
"sparsity/below 1e-6": 2950,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L3/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 3,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 29.04673957824707,
|
17 |
+
"out": 7.051708698272705
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L3/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69691f0beacbf8f0f7b58e245d20f60f764cc445fc418510ae3d1c176a5667e9
|
3 |
+
size 57006482
|
L3/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.00703,
|
3 |
+
"explained_variance": 0.883,
|
4 |
+
"l1": 39.99,
|
5 |
+
"ground_truth_norm": 7.057,
|
6 |
+
"reconstructed_norm": 6.667,
|
7 |
+
"error_norm": 2.2736,
|
8 |
+
"sparsity/below 1e-5": 4204,
|
9 |
+
"sparsity/below 1e-6": 4204,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L4/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 4,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 39.33174514770508,
|
17 |
+
"out": 5.237796306610107
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L4/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:67a2bb6b113eb111dfb759d38f61fdc4bbf6e9e6e99c661608dc6d3fa46aa1b2
|
3 |
+
size 57006482
|
L4/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.00345,
|
3 |
+
"explained_variance": 0.88722,
|
4 |
+
"l1": 23.17,
|
5 |
+
"ground_truth_norm": 5.202,
|
6 |
+
"reconstructed_norm": 4.961,
|
7 |
+
"error_norm": 1.5414,
|
8 |
+
"sparsity/below 1e-5": 3891,
|
9 |
+
"sparsity/below 1e-6": 3887,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L5/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 5,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 40.27893829345703,
|
17 |
+
"out": 5.156437873840332
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L5/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d430374109057795a15794d3e5d72fe6eb41cb79687e3d70e7703b4e000120de
|
3 |
+
size 57006482
|
L5/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0039,
|
3 |
+
"explained_variance": 0.88818,
|
4 |
+
"l1": 24.1,
|
5 |
+
"ground_truth_norm": 5.135,
|
6 |
+
"reconstructed_norm": 4.869,
|
7 |
+
"error_norm": 1.5905,
|
8 |
+
"sparsity/below 1e-5": 3800,
|
9 |
+
"sparsity/below 1e-6": 3796,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L6/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 6,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 41.99013900756836,
|
17 |
+
"out": 4.518642902374268
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L6/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:608cd82d0fc3177f79ff8d2afcd49ccdd5eb14bb1c3cc733013a4f3f37deec5a
|
3 |
+
size 57006482
|
L6/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.00344,
|
3 |
+
"explained_variance": 0.85124,
|
4 |
+
"l1": 19.99,
|
5 |
+
"ground_truth_norm": 4.526,
|
6 |
+
"reconstructed_norm": 4.249,
|
7 |
+
"error_norm": 1.5198,
|
8 |
+
"sparsity/below 1e-5": 3257,
|
9 |
+
"sparsity/below 1e-6": 3251,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L7/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 7,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 57.60716247558594,
|
17 |
+
"out": 5.574440956115723
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L7/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:061d14ad379b786671e815ebe16c3964cfcac5c0b956bcbb0e05c11f0145a443
|
3 |
+
size 57006482
|
L7/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.00393,
|
3 |
+
"explained_variance": 0.87354,
|
4 |
+
"l1": 26.84,
|
5 |
+
"ground_truth_norm": 5.58,
|
6 |
+
"reconstructed_norm": 5.312,
|
7 |
+
"error_norm": 1.6651,
|
8 |
+
"sparsity/below 1e-5": 4378,
|
9 |
+
"sparsity/below 1e-6": 4367,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L8/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 8,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 86.12666320800781,
|
17 |
+
"out": 5.070123672485352
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L8/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5db36f3ba8ff080181ec1f368351c9e13021056ecce6b602347fa2c8088a2487
|
3 |
+
size 57006482
|
L8/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.0032,
|
3 |
+
"explained_variance": 0.88431,
|
4 |
+
"l1": 22.02,
|
5 |
+
"ground_truth_norm": 5.037,
|
6 |
+
"reconstructed_norm": 4.824,
|
7 |
+
"error_norm": 1.3827,
|
8 |
+
"sparsity/below 1e-5": 3809,
|
9 |
+
"sparsity/below 1e-6": 3727,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|
L9/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"d_qk_head": 128,
|
3 |
+
"d_ov_head": 1,
|
4 |
+
"n_qk_heads": 24,
|
5 |
+
"n_ov_heads": 6144,
|
6 |
+
"device": "cuda",
|
7 |
+
"dtype": "torch.float",
|
8 |
+
"virtual_kv_num": 0,
|
9 |
+
"use_z_relu": true,
|
10 |
+
"n_ctx": 256,
|
11 |
+
"layer": 9,
|
12 |
+
"model_name": "EleutherAI/pythia-160m",
|
13 |
+
"mode": "top_k",
|
14 |
+
"top_k": 64,
|
15 |
+
"avg_norm": {
|
16 |
+
"in": 117.01543426513672,
|
17 |
+
"out": 4.0516133308410645
|
18 |
+
},
|
19 |
+
"d_model": 768,
|
20 |
+
"attn_scale": 8.0,
|
21 |
+
"positional_embedding_type": "rotary",
|
22 |
+
"rotary_scale": 1,
|
23 |
+
"rotary_dim": 128,
|
24 |
+
"rotary_base": 10000,
|
25 |
+
"rotary_adjacent_pairs": false,
|
26 |
+
"use_NTK_by_parts_rope": false,
|
27 |
+
"NTK_by_parts_low_freq_factor": null,
|
28 |
+
"NTK_by_parts_high_freq_factor": null,
|
29 |
+
"NTK_by_parts_factor": null,
|
30 |
+
"old_context_len": null
|
31 |
+
}
|
L9/final.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff7c8c57c2a245adec188f711e33c393da6f786e92908155b91ec059d65dec62
|
3 |
+
size 57006482
|
L9/metrics.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"mse_loss": 0.00258,
|
3 |
+
"explained_variance": 0.87377,
|
4 |
+
"l1": 18.61,
|
5 |
+
"ground_truth_norm": 4.069,
|
6 |
+
"reconstructed_norm": 3.853,
|
7 |
+
"error_norm": 1.223,
|
8 |
+
"sparsity/below 1e-5": 3197,
|
9 |
+
"sparsity/below 1e-6": 3019,
|
10 |
+
"positivity": 1.0
|
11 |
+
}
|