WARNING EXPERIMENTAL IQ4_KSS

These first two are just test quants for baseline perplexity comparison:
* `Q8_0` 108.119 GiB (8.505 BPW)
  - Final estimate: PPL = TODO

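The `TODO` baselines above get filled in by a standard perplexity run; here is a minimal sketch using this fork's `llama-perplexity` binary (the corpus path and model filename are assumptions, and the exact flags may differ from the run used for these numbers):

```bash
# Hypothetical invocation; model filename and corpus path are assumptions.
./build/bin/llama-perplexity \
    --model /mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/GLM-4.5-Air-Q8_0.gguf \
    -f wiki.test.raw \
    --ctx-size 512 \
    --threads 8
```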
## IQ4_KSS 54.124 GiB (4.258 BPW)

<details>

<summary>👈 Secret Recipe</summary>

```bash
#!/usr/bin/env bash
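
# Tensor shapes/types of the BF16 source model (elements | dims | type | name),
# as captured from the quantization log. blk.0 is the lone dense FFN layer;
# blk.1 onward are MoE layers with 128 routed experts (*_exps) plus a
# shared expert (*_shexp).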

# 620756992 | 4096, 151552, 1, 1 | Q8_0 | token_embd.weight
#
# 44826624 | 10944, 4096, 1, 1 | Q8_0 | blk.0.ffn_down.weight
# 44826624 | 4096, 10944, 1, 1 | Q8_0 | blk.0.ffn_gate.weight
# 44826624 | 4096, 10944, 1, 1 | Q8_0 | blk.0.ffn_up.weight
# 4096 | 4096, 1, 1, 1 | F32 | blk.0.attn_norm.weight
# 4096 | 4096, 1, 1, 1 | F32 | blk.0.ffn_norm.weight
# 1024 | 1024, 1, 1, 1 | F32 | blk.0.attn_k.bias
# 4194304 | 4096, 1024, 1, 1 | Q8_0 | blk.0.attn_k.weight
# 50331648 | 12288, 4096, 1, 1 | Q8_0 | blk.0.attn_output.weight
# 4194304 | 4096, 1024, 1, 1 | Q8_0 | blk.0.attn_v.weight
# 50331648 | 4096, 12288, 1, 1 | Q8_0 | blk.0.attn_q.weight
# 12288 | 12288, 1, 1, 1 | F32 | blk.0.attn_q.bias
# 1024 | 1024, 1, 1, 1 | F32 | blk.0.attn_v.bias
#
# 738197504 | 1408, 4096, 128, 1 | Q8_0 | blk.1.ffn_down_exps.weight
# 738197504 | 4096, 1408, 128, 1 | Q8_0 | blk.1.ffn_gate_exps.weight
# 738197504 | 4096, 1408, 128, 1 | Q8_0 | blk.1.ffn_up_exps.weight
# 4096 | 4096, 1, 1, 1 | F32 | blk.1.attn_norm.weight
# 128 | 128, 1, 1, 1 | F32 | blk.1.ffn_gate_inp.bias
# 524288 | 4096, 128, 1, 1 | F32 | blk.1.ffn_gate_inp.weight
# 5767168 | 1408, 4096, 1, 1 | Q8_0 | blk.1.ffn_down_shexp.weight
# 5767168 | 4096, 1408, 1, 1 | Q8_0 | blk.1.ffn_gate_shexp.weight
# 5767168 | 4096, 1408, 1, 1 | Q8_0 | blk.1.ffn_up_shexp.weight
# 4194304 | 4096, 1024, 1, 1 | Q8_0 | blk.1.attn_k.weight
# 50331648 | 12288, 4096, 1, 1 | Q8_0 | blk.1.attn_output.weight
# 50331648 | 4096, 12288, 1, 1 | Q8_0 | blk.1.attn_q.weight
# 4194304 | 4096, 1024, 1, 1 | Q8_0 | blk.1.attn_v.weight
# 4096 | 4096, 1, 1, 1 | F32 | blk.1.ffn_norm.weight
# 1024 | 1024, 1, 1, 1 | F32 | blk.1.attn_k.bias
# 12288 | 12288, 1, 1, 1 | F32 | blk.1.attn_q.bias
# 1024 | 1024, 1, 1, 1 | F32 | blk.1.attn_v.bias

# 620756992 | 4096, 151552, 1, 1 | Q8_0 | output.weight
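
# Assumption (not verified here): --custom-q applies these regex=type rules
# first-match-wins, which is why the commented-out per-layer overrides sit
# above the catch-all patterns.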

custom="
# 47 Repeating Layers [0-46]

# Attention
#blk\.(0)\.attn_q.*=q8_0
#blk\.(0)\.attn_k.*=q8_0
#blk\.(0)\.attn_v.*=q8_0
#blk\.(0)\.attn_output.*=q8_0

blk\..*\.attn_q.*=iq5_ks
blk\..*\.attn_k.*=iq5_ks
blk\..*\.attn_v.*=iq5_ks
blk\..*\.attn_output.*=iq5_ks

# First 1 Dense Layers [0]
blk\..*\.ffn_down\.weight=q6_0
blk\..*\.ffn_(gate|up)\.weight=iq5_ks

# Shared Expert Layers [1-46]
blk\..*\.ffn_down_shexp\.weight=q6_0
blk\..*\.ffn_(gate|up)_shexp\.weight=iq5_ks

# Routed Experts Layers [1-46]
#blk\.(3|92)\.ffn_down_exps\.weight=q8_0
#blk\.(3|92)\.ffn_(gate|up)_exps\.weight=q8_0

blk\..*\.ffn_down_exps\.weight=iq4_nl
blk\..*\.ffn_(gate|up)_exps\.weight=iq4_kss

# Non-Repeating Layers
token_embd\.weight=iq4_k
output\.weight=iq6_k
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)
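
# After the grep/sed above, $custom is a single comma-separated rule list:
# "blk\..*\.attn_q.*=iq5_ks,blk\..*\.attn_k.*=iq5_ks,...,output\.weight=iq6_k"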

numactl -N 1 -m 1 \
./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/imatrix-GLM-4.5-Air-BF16.dat \
    /mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/GLM-4.5-Air-128x8.1B-BF16-00001-of-00005.gguf \
    /mnt/raid/models/ubergarm/GLM-4.5-Air-GGUF/GLM-4.5-Air-IQ4_KSS.gguf \
    IQ4_KSS \
    192
```
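
The trailing positional arguments are the target quantization type (`IQ4_KSS`, which the `--custom-q` rules override per tensor) and the thread count (`192`); `numactl -N 1 -m 1` pins both CPU and memory allocation to NUMA node 1.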

</details>

## Quick Start
```bash
# Clone and checkout experimental PR
$ git clone https://github.com/ikawrakow/ik_llama.cpp
$ cd ik_llama.cpp
$ git remote add Thireus https://github.com/Thireus/ik_llama.cpp.git
$ git fetch Thireus  # fetch the remote branch before it can be checked out
$ git checkout glm-4.5-clean

# Build for hybrid CPU+CUDA
$ cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON -DGGML_BLAS=OFF -DGGML_SCHED_MAX_COPIES=1
$ cmake --build build --config Release -j $(nproc)
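
# (assumption, untested) CPU-only variant if no CUDA device is available:
#$ cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=OFF -DGGML_BLAS=OFF
#$ cmake --build build --config Release -j $(nproc)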

# Test Experimental GGUF
$ ./build/bin/llama-server \
    --model WARNING-EXPERIMENTAL-IKLLAMACPP-ONLY-GLM-4.5-Air-IQ4_KSS-00001-of-00002.gguf \
    --alias ubergarm/GLM-4.5-Air-IQ4_KSS \
    --ctx-size 32768 \
    -fa -fmoe \
    -ctk q8_0 -ctv q8_0 \
    --chat-template chatglm4 \
    -ub 4096 -b 4096 \
    -ngl 99 \
    -ot exps=CPU \
    --parallel 1 \
    --threads 8 \
    --host 127.0.0.1 \
    --port 8080 \
    --no-mmap
```
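
With `-ngl 99` all layers are offloaded to the GPU, then `-ot exps=CPU` (`--override-tensor`) pins every tensor whose name matches `exps` (the routed experts) back to CPU/RAM, giving the hybrid CPU+CUDA split the build above targets.

Once the server is up, a quick smoke test against its OpenAI-compatible endpoint; a minimal sketch (prompt and sampling settings are arbitrary):

```bash
curl http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "ubergarm/GLM-4.5-Air-IQ4_KSS",
    "messages": [{"role": "user", "content": "Hello! Briefly, who are you?"}],
    "temperature": 0.6
  }'
```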

## References