WARNING EXPERIMENTAL IQ4_KSS

These first two are just test quants for baseline perplexity comparison:
* `Q8_0` 108.119 GiB (8.505 BPW)
  - Final estimate: PPL = TODO

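The `TODO` baselines above get filled in by a standard perplexity run; here is a minimal sketch using this fork's `llama-perplexity` binary (the corpus path and model filename are assumptions, and the exact flags may differ from the run used for these numbers):

```bash
# Hypothetical invocation; model filename and corpus path are assumptions.
./build/bin/llama-perplexity \
    --model /mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/GLM-4.5-Air-Q8_0.gguf \
    -f wiki.test.raw \
    --ctx-size 512 \
    --threads 8
```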
## IQ4_KSS 54.124 GiB (4.258 BPW)

<details>

<summary>👈 Secret Recipe</summary>

```bash
#!/usr/bin/env bash
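
# Tensor shapes/types of the BF16 source model (elements | dims | type | name),
# as captured from the quantization log. blk.0 is the lone dense FFN layer;
# blk.1 onward are MoE layers with 128 routed experts (*_exps) plus a
# shared expert (*_shexp).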

# 620756992 | 4096, 151552, 1, 1 | Q8_0 | token_embd.weight
#
# 44826624 | 10944, 4096, 1, 1 | Q8_0 | blk.0.ffn_down.weight
# 44826624 | 4096, 10944, 1, 1 | Q8_0 | blk.0.ffn_gate.weight
# 44826624 | 4096, 10944, 1, 1 | Q8_0 | blk.0.ffn_up.weight
# 4096 | 4096, 1, 1, 1 | F32 | blk.0.attn_norm.weight
# 4096 | 4096, 1, 1, 1 | F32 | blk.0.ffn_norm.weight
# 1024 | 1024, 1, 1, 1 | F32 | blk.0.attn_k.bias
# 4194304 | 4096, 1024, 1, 1 | Q8_0 | blk.0.attn_k.weight
# 50331648 | 12288, 4096, 1, 1 | Q8_0 | blk.0.attn_output.weight
# 4194304 | 4096, 1024, 1, 1 | Q8_0 | blk.0.attn_v.weight
# 50331648 | 4096, 12288, 1, 1 | Q8_0 | blk.0.attn_q.weight
# 12288 | 12288, 1, 1, 1 | F32 | blk.0.attn_q.bias
# 1024 | 1024, 1, 1, 1 | F32 | blk.0.attn_v.bias
#
# 738197504 | 1408, 4096, 128, 1 | Q8_0 | blk.1.ffn_down_exps.weight
# 738197504 | 4096, 1408, 128, 1 | Q8_0 | blk.1.ffn_gate_exps.weight
# 738197504 | 4096, 1408, 128, 1 | Q8_0 | blk.1.ffn_up_exps.weight
# 4096 | 4096, 1, 1, 1 | F32 | blk.1.attn_norm.weight
# 128 | 128, 1, 1, 1 | F32 | blk.1.ffn_gate_inp.bias
# 524288 | 4096, 128, 1, 1 | F32 | blk.1.ffn_gate_inp.weight
# 5767168 | 1408, 4096, 1, 1 | Q8_0 | blk.1.ffn_down_shexp.weight
# 5767168 | 4096, 1408, 1, 1 | Q8_0 | blk.1.ffn_gate_shexp.weight
# 5767168 | 4096, 1408, 1, 1 | Q8_0 | blk.1.ffn_up_shexp.weight
# 4194304 | 4096, 1024, 1, 1 | Q8_0 | blk.1.attn_k.weight
# 50331648 | 12288, 4096, 1, 1 | Q8_0 | blk.1.attn_output.weight
# 50331648 | 4096, 12288, 1, 1 | Q8_0 | blk.1.attn_q.weight
# 4194304 | 4096, 1024, 1, 1 | Q8_0 | blk.1.attn_v.weight
# 4096 | 4096, 1, 1, 1 | F32 | blk.1.ffn_norm.weight
# 1024 | 1024, 1, 1, 1 | F32 | blk.1.attn_k.bias
# 12288 | 12288, 1, 1, 1 | F32 | blk.1.attn_q.bias
# 1024 | 1024, 1, 1, 1 | F32 | blk.1.attn_v.bias

# 620756992 | 4096, 151552, 1, 1 | Q8_0 | output.weight
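
# Assumption (not verified here): --custom-q applies these regex=type rules
# first-match-wins, which is why the commented-out per-layer overrides sit
# above the catch-all patterns.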

custom="
# 47 Repeating Layers [0-46]

# Attention
#blk\.(0)\.attn_q.*=q8_0
#blk\.(0)\.attn_k.*=q8_0
#blk\.(0)\.attn_v.*=q8_0
#blk\.(0)\.attn_output.*=q8_0

blk\..*\.attn_q.*=iq5_ks
blk\..*\.attn_k.*=iq5_ks
blk\..*\.attn_v.*=iq5_ks
blk\..*\.attn_output.*=iq5_ks

# First 1 Dense Layers [0]
blk\..*\.ffn_down\.weight=q6_0
blk\..*\.ffn_(gate|up)\.weight=iq5_ks

# Shared Expert Layers [1-46]
blk\..*\.ffn_down_shexp\.weight=q6_0
blk\..*\.ffn_(gate|up)_shexp\.weight=iq5_ks

# Routed Experts Layers [1-46]
#blk\.(3|92)\.ffn_down_exps\.weight=q8_0
#blk\.(3|92)\.ffn_(gate|up)_exps\.weight=q8_0

blk\..*\.ffn_down_exps\.weight=iq4_nl
blk\..*\.ffn_(gate|up)_exps\.weight=iq4_kss

# Non-Repeating Layers
token_embd\.weight=iq4_k
output\.weight=iq6_k
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)
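
# After the grep/sed above, $custom is a single comma-separated rule list:
# "blk\..*\.attn_q.*=iq5_ks,blk\..*\.attn_k.*=iq5_ks,...,output\.weight=iq6_k"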

numactl -N 1 -m 1 \
./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/imatrix-GLM-4.5-Air-BF16.dat \
    /mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/GLM-4.5-Air-128x8.1B-BF16-00001-of-00005.gguf \
    /mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/GLM-4.5-Air-IQ4_KSS.gguf \
    IQ4_KSS \
    192
```
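
The trailing positional arguments are the target quantization type (`IQ4_KSS`, which the `--custom-q` rules override per tensor) and the thread count (`192`); `numactl -N 1 -m 1` pins both CPU and memory allocation to NUMA node 1.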

</details>

## Quick Start
```bash
# Clone and checkout experimental PR
$ git clone https://github.com/ikawrakow/ik_llama.cpp
$ cd ik_llama.cpp
$ git remote add Thireus https://github.com/Thireus/ik_llama.cpp.git
$ git fetch Thireus  # fetch the remote branch before it can be checked out
$ git checkout glm-4.5-clean

# Build for hybrid CPU+CUDA
$ cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON -DGGML_BLAS=OFF -DGGML_SCHED_MAX_COPIES=1
$ cmake --build build --config Release -j $(nproc)
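
# (assumption, untested) CPU-only variant if no CUDA device is available:
#$ cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=OFF -DGGML_BLAS=OFF
#$ cmake --build build --config Release -j $(nproc)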

# Test Experimental GGUF
$ ./build/bin/llama-server \
    --model WARNING-EXPERIMENTAL-IKLLAMACPP-ONLY-GLM-4.5-Air-IQ4_KSS-00001-of-00002.gguf \
    --alias ubergarm/GLM-4.5-Air-IQ4_KSS \
    --ctx-size 32768 \
    -fa -fmoe \
    -ctk q8_0 -ctv q8_0 \
    --chat-template chatglm4 \
    -ub 4096 -b 4096 \
    -ngl 99 \
    -ot exps=CPU \
    --parallel 1 \
    --threads 8 \
    --host 127.0.0.1 \
    --port 8080 \
    --no-mmap
```
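
With `-ngl 99` all layers are offloaded to the GPU, then `-ot exps=CPU` (`--override-tensor`) pins every tensor whose name matches `exps` (the routed experts) back to CPU/RAM, giving the hybrid CPU+CUDA split the build above targets.

Once the server is up, a quick smoke test against its OpenAI-compatible endpoint; a minimal sketch (prompt and sampling settings are arbitrary):

```bash
curl http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "ubergarm/GLM-4.5-Air-IQ4_KSS",
    "messages": [{"role": "user", "content": "Hello! Briefly, who are you?"}],
    "temperature": 0.6
  }'
```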

## References