AndreHathora committed
Commit 6e06b7a · Parent: 8bcba7b

Add enhanced KV cache calculator with GQA/MHA detection and fp4 support


- Added support for GQA vs MHA detection and display
- Implemented fp4 data type support (MXFP4)
- Enhanced model configuration display with calculation formulas
- Set Qwen3-30B-A3B as default model
- Added proper attribution to gaunernst's original implementation
- Optimized interface for iframe embedding in blogs

Files changed (3)
  1. README.md +27 -5
  2. app.py +125 -0
  3. requirements.txt +1 -0
README.md CHANGED
@@ -1,13 +1,35 @@
  ---
  title: LLM KV Cache Calculator
- emoji: 👀
- colorFrom: pink
- colorTo: gray
+ emoji: 💻
+ colorFrom: blue
+ colorTo: purple
  sdk: gradio
  sdk_version: 5.45.0
  app_file: app.py
  pinned: false
- short_description: KV cache calculator for LLMs
+ short_description: Calculate KV cache memory requirements for transformer models with support for MHA, GQA, and MLA attention mechanisms
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # KV Cache Calculator
+
+ Calculate KV cache memory requirements for transformer models.
+
+ ## Credits
+
+ This implementation is derived from and builds upon the excellent work by [gaunernst](https://huggingface.co/spaces/gaunernst/kv-cache-calculator). Special thanks for the original implementation!
+
+ ## Features
+
+ - **Multi-attention support**: MHA (Multi-Head Attention), GQA (Grouped Query Attention), and MLA (Multi-head Latent Attention)
+ - **Multiple data types**: fp16/bf16, fp8, and fp4 quantization
+ - **Real-time calculation**: Instant memory requirement estimates
+ - **Model analysis**: Detailed breakdown of model configuration
+ - **Universal compatibility**: Works with any HuggingFace transformer model
+
+ ## Usage
+
+ 1. Enter your model ID (e.g., "Qwen/Qwen3-30B-A3B")
+ 2. Set context length and number of users
+ 3. Choose data type precision
+ 4. Add HuggingFace token if needed for gated models
+ 5. Click calculate to get memory requirements
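
For a quick sense of the arithmetic behind the Features and Usage sections above, here is a minimal sketch of the GQA estimate in the same units the app reports. The layer count, KV-head count, and head dimension below are illustrative assumptions, not values read from any real model config.

```python
# Illustrative GQA KV-cache estimate (assumed example values, not a real config).
num_layers = 48       # assumed
num_kv_heads = 4      # assumed (GQA: fewer KV heads than query heads)
head_dim = 128        # assumed
ctx_len = 128_000     # tokens of context
num_users = 1
bytes_per_elem = 2    # fp16/bf16

# The factor of 2 accounts for storing both keys and values per layer.
elems_per_token = num_layers * num_kv_heads * head_dim * 2
kv_cache_gb = elems_per_token * ctx_len * num_users * bytes_per_elem / 1e9
print(f"{kv_cache_gb:.2f} GB")  # ~12.58 GB with these assumed values
```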
app.py ADDED
@@ -0,0 +1,125 @@
+ import gradio as gr
+ from transformers import AutoConfig
+
+ # Credits: This implementation is derived from and builds upon the excellent work by gaunernst
+ # Original implementation: https://huggingface.co/spaces/gaunernst/kv-cache-calculator
+
+
+ def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str):
+     hf_token = hf_token.strip()
+     try:
+         cfg = AutoConfig.from_pretrained(
+             name,
+             trust_remote_code=True,
+             token=hf_token or None,
+         )
+     except Exception as e:
+         raise gr.Error(e)
+
+     use_mla = cfg.architectures[0].startswith(("DeepseekV2", "DeepseekV3"))
+
+     if hasattr(cfg, "text_config"):
+         cfg = cfg.text_config
+
+     num_layers = cfg.num_hidden_layers
+
+     # Determine attention mechanism type
+     num_attention_heads = cfg.num_attention_heads
+     num_kv_heads = getattr(cfg, "num_key_value_heads", num_attention_heads)
+
+     if use_mla:
+         attention_type = "MLA"
+     elif num_kv_heads == num_attention_heads:
+         attention_type = "MHA"
+     else:
+         attention_type = "GQA"
+
+     model_config = [
+         ["num_layers", num_layers],
+         ["max_ctx_len", cfg.max_position_embeddings],
+         ["attention_type", attention_type],
+         ["num_attention_heads", num_attention_heads],
+         ["num_kv_heads", num_kv_heads],
+     ]
+     if ctx_len > cfg.max_position_embeddings:
+         gr.Warning(
+             "Requested context length is larger than the max value supported by the model"
+         )
+
+     # Calculate KV cache elements per token based on attention mechanism
+     if use_mla:
+         kv_lora_rank = cfg.kv_lora_rank
+         qk_rope_head_dim = cfg.qk_rope_head_dim
+         nelems_per_token = num_layers * (kv_lora_rank + qk_rope_head_dim)
+
+         model_config.append(["kv_lora_rank", kv_lora_rank])
+         model_config.append(["qk_rope_head_dim", qk_rope_head_dim])
+         model_config.append(["calc_formula", f"{num_layers} * ({kv_lora_rank} + {qk_rope_head_dim})"])
+
+     else:
+         head_dim = getattr(cfg, "head_dim", cfg.hidden_size // num_attention_heads)
+         nelems_per_token = num_layers * num_kv_heads * head_dim * 2  # 2 for key and value
+
+         model_config.append(["head_dim", head_dim])
+         if attention_type == "GQA":
+             kv_ratio = num_attention_heads // num_kv_heads
+             model_config.append(["gqa_ratio", f"{kv_ratio}:1"])
+         model_config.append(["calc_formula", f"{num_layers} * {num_kv_heads} * {head_dim} * 2"])
+
+     if dtype == "fp16/bf16":
+         nbytes_per_elem = 2
+     elif dtype == "fp8":
+         nbytes_per_elem = 1 + 2 / cfg.hidden_size  # assume per-token scaling
+     elif dtype == "fp4":
+         nbytes_per_elem = 0.5 + 2 / 32  # 4-bit weights + scaling factor every 32 elements (MXFP4)
+
+     kv_cache_size = nelems_per_token * ctx_len * num_users * nbytes_per_elem / 1e9
+     return kv_cache_size, model_config
+
+
+ # Minimal description for iframe embedding
+ DESCRIPTION = (
+     "Calculate KV cache memory requirements for transformer models. "
+     "Supports MHA, GQA, and MLA attention mechanisms with fp16/bf16, fp8, and fp4 data types."
+ )
+
+ demo = gr.Interface(
+     title="KV Cache Calculator",
+     description=DESCRIPTION,
+     fn=calculate,
+     inputs=[
+         gr.Textbox(label="Model ID", value="Qwen/Qwen3-30B-A3B", placeholder="e.g., Qwen/Qwen3-30B-A3B"),
+         gr.Number(label="Context Length", value=128_000, minimum=1),
+         gr.Number(label="Number of Users", value=1, minimum=1),
+         gr.Dropdown(label="KV Cache Data Type", choices=["fp16/bf16", "fp8", "fp4"], value="fp16/bf16"),
+         gr.Textbox(label="HuggingFace Token (optional)", type="password", placeholder="For gated models"),
+     ],
+     outputs=[
+         gr.Number(label="KV Cache Size (GB)", precision=2),
+         gr.Dataframe(
+             label="Model Configuration",
+             headers=["Parameter", "Value"],
+             datatype=["str", "str"],
+             wrap=True
+         ),
+     ],
+     theme=gr.themes.Soft(),
+     css="""
+     .gradio-container {
+         max-width: 800px !important;
+         margin: 0 auto !important;
+     }
+     """,
+     analytics_enabled=False,
+ )
+
+ if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True,
+         # Enable embedding in iframes
+         allowed_paths=[],
+         app_kwargs={"docs_url": None, "redoc_url": None}
+     )
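
Outside the Gradio UI, the `calculate` function above can be exercised directly. A minimal sketch, assuming `app.py` is on the import path and the chosen model's config is downloadable without a token; the model ID and parameters are illustrative:

```python
# Minimal sketch: call the calculator directly (assumes app.py is importable;
# importing it builds the Gradio interface but does not launch the server).
from app import calculate

size_gb, config_rows = calculate(
    name="Qwen/Qwen3-30B-A3B",  # illustrative model ID
    ctx_len=32_768,
    num_users=4,
    dtype="fp8",
    hf_token="",
)
print(f"Estimated KV cache: {size_gb:.2f} GB")
for param, value in config_rows:
    print(f"{param}: {value}")
```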
requirements.txt ADDED
@@ -0,0 +1 @@
+ transformers