Commit · 6e06b7a
Parent(s): 8bcba7b
Add enhanced KV cache calculator with GQA/MHA detection and fp4 support

- Added support for GQA vs MHA detection and display
- Implemented fp4 data type support (MXFP4)
- Enhanced model configuration display with calculation formulas
- Set Qwen3-30B-A3B as default model
- Added proper attribution to gaunernst's original implementation
- Optimized interface for iframe embedding in blogs
- README.md +27 -5
- app.py +125 -0
- requirements.txt +1 -0
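
For a rough sense of what the GQA/MHA formula added in this commit computes, here is a worked example with made-up configuration values (no specific model is implied; real values come from the model's HuggingFace config):

```python
# Illustrative back-of-the-envelope check of the GQA/MHA KV cache formula.
# All numbers below are made up for the example.
num_layers = 48
num_kv_heads = 4
head_dim = 128
ctx_len = 128_000
num_users = 1

nelems_per_token = num_layers * num_kv_heads * head_dim * 2   # keys + values
bytes_fp16 = nelems_per_token * ctx_len * num_users * 2       # 2 bytes per element
print(f"{bytes_fp16 / 1e9:.2f} GB")  # ~12.58 GB for this made-up config
```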
README.md CHANGED

```diff
@@ -1,13 +1,35 @@
 ---
 title: LLM KV Cache Calculator
-emoji:
-colorFrom:
-colorTo:
+emoji: 💻
+colorFrom: blue
+colorTo: purple
 sdk: gradio
 sdk_version: 5.45.0
 app_file: app.py
 pinned: false
-short_description: KV cache
+short_description: Calculate KV cache memory requirements for transformer models with support for MHA, GQA, and MLA attention mechanisms
 ---
 
-
+# KV Cache Calculator
+
+Calculate KV cache memory requirements for transformer models.
+
+## Credits
+
+This implementation is derived from and builds upon the excellent work by [gaunernst](https://huggingface.co/spaces/gaunernst/kv-cache-calculator). Special thanks for the original implementation!
+
+## Features
+
+- **Multi-attention support**: MHA (Multi-Head Attention), GQA (Grouped Query Attention), and MLA (Multi-head Latent Attention)
+- **Multiple data types**: fp16/bf16, fp8, and fp4 quantization
+- **Real-time calculation**: Instant memory requirement estimates
+- **Model analysis**: Detailed breakdown of model configuration
+- **Universal compatibility**: Works with any HuggingFace transformer model
+
+## Usage
+
+1. Enter your model ID (e.g., "Qwen/Qwen3-30B-A3B")
+2. Set context length and number of users
+3. Choose data type precision
+4. Add HuggingFace token if needed for gated models
+5. Click calculate to get memory requirements
```
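
The data-type options listed under Features translate into different effective storage costs per cached element. A minimal sketch of that mapping, mirroring the assumptions encoded in app.py below (the `hidden_size` default here is illustrative):

```python
def bytes_per_element(dtype: str, hidden_size: int = 4096) -> float:
    """Approximate bytes per cached element, mirroring app.py's assumptions."""
    if dtype == "fp16/bf16":
        return 2.0
    if dtype == "fp8":
        # 1 byte per element plus a 2-byte per-token scale amortized over the token
        return 1 + 2 / hidden_size
    if dtype == "fp4":
        # MXFP4: 4-bit elements plus one shared 2-byte scale per 32-element block
        return 0.5 + 2 / 32
    raise ValueError(f"unknown dtype: {dtype}")

print(bytes_per_element("fp4"))  # 0.5625
```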
app.py ADDED

```diff
@@ -0,0 +1,125 @@
+import gradio as gr
+from transformers import AutoConfig
+
+# Credits: This implementation is derived from and builds upon the excellent work by gaunernst
+# Original implementation: https://huggingface.co/spaces/gaunernst/kv-cache-calculator
+
+
+def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str):
+    hf_token = hf_token.strip()
+    try:
+        cfg = AutoConfig.from_pretrained(
+            name,
+            trust_remote_code=True,
+            token=hf_token or None,
+        )
+    except Exception as e:
+        raise gr.Error(e)
+
+    use_mla = cfg.architectures[0].startswith(("DeepseekV2", "DeepseekV3"))
+
+    if hasattr(cfg, "text_config"):
+        cfg = cfg.text_config
+
+    num_layers = cfg.num_hidden_layers
+
+    # Determine attention mechanism type
+    num_attention_heads = cfg.num_attention_heads
+    num_kv_heads = getattr(cfg, "num_key_value_heads", num_attention_heads)
+
+    if use_mla:
+        attention_type = "MLA"
+    elif num_kv_heads == num_attention_heads:
+        attention_type = "MHA"
+    else:
+        attention_type = "GQA"
+
+    model_config = [
+        ["num_layers", num_layers],
+        ["max_ctx_len", cfg.max_position_embeddings],
+        ["attention_type", attention_type],
+        ["num_attention_heads", num_attention_heads],
+        ["num_kv_heads", num_kv_heads],
+    ]
+    if ctx_len > cfg.max_position_embeddings:
+        gr.Warning(
+            "Requested context length is larger than the max value supported by the model"
+        )
+
+    # Calculate KV cache elements per token based on attention mechanism
+    if use_mla:
+        kv_lora_rank = cfg.kv_lora_rank
+        qk_rope_head_dim = cfg.qk_rope_head_dim
+        nelems_per_token = num_layers * (kv_lora_rank + qk_rope_head_dim)
+
+        model_config.append(["kv_lora_rank", kv_lora_rank])
+        model_config.append(["qk_rope_head_dim", qk_rope_head_dim])
+        model_config.append(["calc_formula", f"{num_layers} * ({kv_lora_rank} + {qk_rope_head_dim})"])
+
+    else:
+        head_dim = getattr(cfg, "head_dim", cfg.hidden_size // num_attention_heads)
+        nelems_per_token = num_layers * num_kv_heads * head_dim * 2  # 2 for key and value
+
+        model_config.append(["head_dim", head_dim])
+        if attention_type == "GQA":
+            kv_ratio = num_attention_heads // num_kv_heads
+            model_config.append(["gqa_ratio", f"{kv_ratio}:1"])
+        model_config.append(["calc_formula", f"{num_layers} * {num_kv_heads} * {head_dim} * 2"])
+
+    if dtype == "fp16/bf16":
+        nbytes_per_elem = 2
+    elif dtype == "fp8":
+        nbytes_per_elem = 1 + 2 / cfg.hidden_size  # assume per-token scaling
+    elif dtype == "fp4":
+        nbytes_per_elem = 0.5 + 2 / 32  # 4-bit weights + scaling factor every 32 elements (MXFP4)
+
+    kv_cache_size = nelems_per_token * ctx_len * num_users * nbytes_per_elem / 1e9
+    return kv_cache_size, model_config
+
+
+# Minimal description for iframe embedding
+DESCRIPTION = (
+    "Calculate KV cache memory requirements for transformer models. "
+    "Supports MHA, GQA, and MLA attention mechanisms with fp16/bf16, fp8, and fp4 data types."
+)
+
+demo = gr.Interface(
+    title="KV Cache Calculator",
+    description=DESCRIPTION,
+    fn=calculate,
+    inputs=[
+        gr.Textbox(label="Model ID", value="Qwen/Qwen3-30B-A3B", placeholder="e.g., Qwen/Qwen3-30B-A3B"),
+        gr.Number(label="Context Length", value=128_000, minimum=1),
+        gr.Number(label="Number of Users", value=1, minimum=1),
+        gr.Dropdown(label="KV Cache Data Type", choices=["fp16/bf16", "fp8", "fp4"], value="fp16/bf16"),
+        gr.Textbox(label="HuggingFace Token (optional)", type="password", placeholder="For gated models"),
+    ],
+    outputs=[
+        gr.Number(label="KV Cache Size (GB)", precision=2),
+        gr.Dataframe(
+            label="Model Configuration",
+            headers=["Parameter", "Value"],
+            datatype=["str", "str"],
+            wrap=True
+        ),
+    ],
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {
+        max-width: 800px !important;
+        margin: 0 auto !important;
+    }
+    """,
+    analytics_enabled=False,
+)
+
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True,
+        # Enable embedding in iframes
+        allowed_paths=[],
+        app_kwargs={"docs_url": None, "redoc_url": None}
+    )
```
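
Since the app is built around a single `calculate` function, the same estimate can be reproduced outside the Gradio UI. A minimal sketch of a direct call (the model ID matches the app's default; the call fetches the config from the Hub, and the printed figure depends on that config):

```python
# Hypothetical direct call to the calculate() function defined in app.py above.
# Requires network access to download the model config.
size_gb, config_rows = calculate(
    name="Qwen/Qwen3-30B-A3B",
    ctx_len=32_768,
    num_users=1,
    dtype="fp8",
    hf_token="",
)
print(f"KV cache: {size_gb:.2f} GB")
for param, value in config_rows:
    print(param, value)
```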
requirements.txt ADDED

```diff
@@ -0,0 +1 @@
+transformers
```