Release IQ3_KS with perplexity info
Browse files- README.md +68 -0
- images/perplexity.png +2 -2
README.md
CHANGED
@@ -283,6 +283,74 @@ numactl -N 0 -m 0 \
|
|
283 |
|
284 |
</details>
|
285 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
## IQ2_KL 43.870 GiB (3.411 BPW)
|
287 |
Final estimate: PPL = 5.0697 +/- 0.03166
|
288 |
|
|
|
283 |
|
284 |
</details>
|
285 |
|
286 |
+
## IQ3_KS 49.072 GiB (3.816 BPW)
|
287 |
+
Final estimate: PPL = 4.7975 +/- 0.02972
|
288 |
+
|
289 |
+
<details>
|
290 |
+
|
291 |
+
<summary>👈 Secret Recipe</summary>
|
292 |
+
|
293 |
+
```bash
|
294 |
+
#!/usr/bin/env bash
|
295 |
+
|
296 |
+
custom="
|
297 |
+
# 47 Repeating Layers [0-46]
|
298 |
+
# Note: All ffn_down.* layers are not divisible by 256 so have limited quantization options.
|
299 |
+
|
300 |
+
# Attention
|
301 |
+
blk\.(0|1)\.attn_q.*=q8_0
|
302 |
+
blk\.(0|1)\.attn_k.*=q8_0
|
303 |
+
blk\.(0|1)\.attn_v.*=q8_0
|
304 |
+
blk\.(0|1)\.attn_output.*=q8_0
|
305 |
+
|
306 |
+
blk\..*\.attn_q.*=iq5_ks
|
307 |
+
blk\..*\.attn_k.*=iq5_ks
|
308 |
+
blk\..*\.attn_v.*=iq5_ks
|
309 |
+
blk\..*\.attn_output.*=iq5_ks
|
310 |
+
|
311 |
+
# First 1 Dense Layers [0]
|
312 |
+
blk\..*\.ffn_down\.weight=q6_0
|
313 |
+
blk\..*\.ffn_(gate|up)\.weight=iq5_ks
|
314 |
+
|
315 |
+
# Shared Expert Layers [1-46]
|
316 |
+
blk\..*\.ffn_down_shexp\.weight=q6_0
|
317 |
+
blk\..*\.ffn_(gate|up)_shexp\.weight=iq5_ks
|
318 |
+
|
319 |
+
# Routed Experts Layers [1-46]
|
320 |
+
blk\.(1)\.ffn_down_exps\.weight=q6_0
|
321 |
+
blk\.(1)\.ffn_(gate|up)_exps\.weight=iq5_ks
|
322 |
+
|
323 |
+
blk\..*\.ffn_down_exps\.weight=iq4_nl
|
324 |
+
blk\..*\.ffn_(gate|up)_exps\.weight=iq3_ks
|
325 |
+
|
326 |
+
# Non-Repeating Layers
|
327 |
+
token_embd\.weight=iq4_k
|
328 |
+
output\.weight=iq6_k
|
329 |
+
|
330 |
+
# NextN MTP Layer [46]
|
331 |
+
blk\..*\.nextn\.embed_tokens\.weight=iq5_ks
|
332 |
+
blk\..*\.nextn\.shared_head_head\.weight=iq5_ks
|
333 |
+
blk\..*\.nextn\.eh_proj\.weight=q8_0
|
334 |
+
"
|
335 |
+
|
336 |
+
custom=$(
|
337 |
+
echo "$custom" | grep -v '^#' | \
|
338 |
+
sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
|
339 |
+
)
|
340 |
+
|
341 |
+
numactl -N 0 -m 0 \
|
342 |
+
./build/bin/llama-quantize \
|
343 |
+
--custom-q "$custom" \
|
344 |
+
--imatrix /mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/imatrix-GLM-4.5-Air-BF16.dat \
|
345 |
+
/mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/GLM-4.5-Air-128x9.4B-BF16-00001-of-00005.gguf \
|
346 |
+
/mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/GLM-4.5-Air-PR624-IQ3_KS.gguf \
|
347 |
+
IQ3_KS \
|
348 |
+
192
|
349 |
+
```
|
350 |
+
|
351 |
+
</details>
|
352 |
+
|
353 |
+
|
354 |
## IQ2_KL 43.870 GiB (3.411 BPW)
|
355 |
Final estimate: PPL = 5.0697 +/- 0.03166
|
356 |
|
images/perplexity.png
CHANGED
![]() |
Git LFS Details
|
![]() |
Git LFS Details
|