Spaces:

HathoraResearch
/

LLM-KV-cache-calculator

Running

AndreHathora committed on Sep 10

Commit

baf381a

1 Parent(s): fb095c2

Implement real-time HuggingFace Hub search functionality

- Added live search of entire HF Hub database via API
- Implemented caching system for better performance
- Fixed textbox glitching by removing feedback loop
- Search now returns actual models from HF Hub, not just filtered static list
- Enhanced search with multi-tier approach (text-generation + broader search)
- Popular models prioritized in search results
- Added huggingface_hub dependency for API access

Files changed (2) hide show

app.py +169 -35
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,9 +1,101 @@
 import gradio as gr
 from transformers import AutoConfig
 # Credits: This implementation is derived from and builds upon the excellent work by gaunernst
 # Original implementation: https://huggingface.co/spaces/gaunernst/kv-cache-calculator
 def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str):
     hf_token = hf_token.strip()
@@ -22,8 +114,6 @@ def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str
         cfg = cfg.text_config
     num_layers = cfg.num_hidden_layers
-    # Determine attention mechanism type
     num_attention_heads = cfg.num_attention_heads
     num_kv_heads = getattr(cfg, "num_key_value_heads", num_attention_heads)
@@ -46,7 +136,6 @@ def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str
             "Requested context length is larger than the max value supported by the model"
         )
-    # Calculate KV cache elements per token based on attention mechanism
     if use_mla:
         kv_lora_rank = cfg.kv_lora_rank
         qk_rope_head_dim = cfg.qk_rope_head_dim
@@ -58,7 +147,7 @@ def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str
     else:
         head_dim = getattr(cfg, "head_dim", cfg.hidden_size // num_attention_heads)
-        nelems_per_token = num_layers * num_kv_heads * head_dim * 2  # 2 for key and value
         model_config.append(["head_dim", head_dim])
         if attention_type == "GQA":
@@ -77,41 +166,87 @@ def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str
     return kv_cache_size, model_config
-# Minimal description for iframe embedding
 DESCRIPTION = (
     "Calculate KV cache memory requirements for transformer models. "
     "Supports MHA, GQA, and MLA attention mechanisms with fp16/bf16, fp8, and fp4 data types."
 )
-demo = gr.Interface(
-    title="KV Cache Calculator",
-    description=DESCRIPTION,
-    fn=calculate,
-    inputs=[
-        gr.Textbox(label="Model ID", value="Qwen/Qwen3-30B-A3B", placeholder="e.g., Qwen/Qwen3-30B-A3B"),
-        gr.Number(label="Context Length", value=128_000, minimum=1),
-        gr.Number(label="Number of Users", value=1, minimum=1),
-        gr.Dropdown(label="KV Cache Data Type", choices=["fp16/bf16", "fp8", "fp4"], value="fp16/bf16"),
-        gr.Textbox(label="HuggingFace Token (optional)", type="password", placeholder="For gated models"),
-    ],
-    outputs=[
-        gr.Number(label="KV Cache Size (GB)", precision=2),
-        gr.Dataframe(
-            label="Model Configuration",
-            headers=["Parameter", "Value"],
-            datatype=["str", "str"],
-            wrap=True
-        ),
-    ],
-    theme=gr.themes.Soft(),
-    css="""
-    .gradio-container {
-        max-width: 800px !important;
-        margin: 0 auto !important;
-    }
-    """,
-    analytics_enabled=False,
-)
 if __name__ == "__main__":
     demo.launch(
@@ -119,7 +254,6 @@ if __name__ == "__main__":
         server_port=7860,
         share=False,
         show_error=True,
-        # Enable embedding in iframes
         allowed_paths=[],
         app_kwargs={"docs_url": None, "redoc_url": None}
     )

 import gradio as gr
 from transformers import AutoConfig
+from huggingface_hub import list_models
+import asyncio
+from typing import List
+import time
+from functools import lru_cache
 # Credits: This implementation is derived from and builds upon the excellent work by gaunernst
 # Original implementation: https://huggingface.co/spaces/gaunernst/kv-cache-calculator
# Cache of recent Hub searches.
# Key:   "<lower-cased query>_<max_results>"
# Value: (list of model ids, time.time() when the entry was stored)
search_cache = {}

# Seconds a cached search result is considered fresh.
_SEARCH_CACHE_TTL_SECONDS = 300
# Maximum number of cached searches kept before the oldest is evicted.
_SEARCH_CACHE_MAX_ENTRIES = 20
# Substrings a model id must contain to be accepted from the broader
# (not task-filtered) fallback search.
_BROAD_SEARCH_KEYWORDS = ("chat", "instruct", "base", "model")

# Default suggestions shown before the user has typed a usable query, and
# the fallback pool when the Hub search fails.
# NOTE(review): confirm every id below resolves to a Hub repository that
# ships a config.json; the "anthropic/..." entry in particular looks
# unlikely to be loadable by AutoConfig.
POPULAR_MODELS = [
    "Qwen/Qwen3-30B-A3B",
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-3.1-70B-Instruct",
    "microsoft/DialoGPT-medium",
    "microsoft/DialoGPT-large",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "deepseek-ai/DeepSeek-V2-Chat",
    "deepseek-ai/DeepSeek-V3-Base",
    "google/gemma-2-9b",
    "google/gemma-2-27b",
    "Qwen/QwQ-32B-Preview",
    "Qwen/Qwen2.5-72B-Instruct",
    "anthropic/claude-3-haiku-20240307",
]


def search_models(query: str, max_results: int = 50) -> List[str]:
    """Return model ids matching *query*, searching the HuggingFace Hub.

    Matching entries of ``POPULAR_MODELS`` are listed first, followed by
    Hub results for the ``text-generation`` task sorted by downloads. If
    that yields fewer than ``max_results // 2`` ids, a second search
    without the task filter is run and only ids containing one of
    ``_BROAD_SEARCH_KEYWORDS`` are added.

    Results are cached in ``search_cache`` for
    ``_SEARCH_CACHE_TTL_SECONDS`` seconds. The returned list is always a
    fresh copy, so callers may mutate it without corrupting the cache.

    Args:
        query: Free-text search string. Empty or whitespace-only input
            returns the first 15 popular models without touching the Hub.
        max_results: Upper bound on the number of ids returned.

    Returns:
        A list of model id strings. If the Hub search raises, the popular
        models matching *query* are returned, or the first 15 popular
        models when none match.
    """
    if not query or not query.strip():
        return POPULAR_MODELS[:15]

    query = query.strip()
    query_lower = query.lower()
    cache_key = f"{query_lower}_{max_results}"
    current_time = time.time()

    cached = search_cache.get(cache_key)
    if cached is not None:
        cached_result, cache_time = cached
        if current_time - cache_time < _SEARCH_CACHE_TTL_SECONDS:
            # Hand out a copy: callers insert into the list they receive.
            return list(cached_result)

    # Top-level boundary for the network call: any failure degrades to the
    # static list instead of breaking the UI.
    try:
        print(f"Searching HF Hub for: {query}")
        models = list_models(
            search=query,
            task="text-generation",
            library="transformers",
            sort="downloads",
            direction=-1,
            limit=max_results * 2,
            full=False
        )

        all_matches = []
        seen_models = set()

        # Popular models that match come first.
        for model_id in POPULAR_MODELS:
            if query_lower in model_id.lower() and model_id not in seen_models:
                all_matches.append(model_id)
                seen_models.add(model_id)

        for model in models:
            if len(all_matches) >= max_results:
                # Stop consuming the Hub iterator once the list is full.
                break
            if model.id not in seen_models:
                all_matches.append(model.id)
                seen_models.add(model.id)

        if len(all_matches) < max_results // 2:
            # Too few task-tagged hits: retry without the task filter and
            # keep only ids that look like chat/instruct/base models.
            try:
                broader_models = list_models(
                    search=query,
                    library="transformers",
                    sort="downloads",
                    direction=-1,
                    limit=max_results * 2
                )
                for model in broader_models:
                    if len(all_matches) >= max_results:
                        break
                    if model.id in seen_models:
                        continue
                    model_id_lower = model.id.lower()
                    if any(keyword in model_id_lower for keyword in _BROAD_SEARCH_KEYWORDS):
                        all_matches.append(model.id)
                        seen_models.add(model.id)
            except Exception as e:
                print(f"Broader search failed: {e}")

        result = all_matches[:max_results]
        search_cache[cache_key] = (result, current_time)

        # Keep the cache bounded by dropping the oldest entries.
        while len(search_cache) > _SEARCH_CACHE_MAX_ENTRIES:
            oldest_key = min(search_cache, key=lambda k: search_cache[k][1])
            del search_cache[oldest_key]

        # Return a copy so the cached list stays untouched.
        return list(result)
    except Exception as e:
        print(f"Search error: {e}")
        popular_matches = [model for model in POPULAR_MODELS if query_lower in model.lower()]
        return popular_matches if popular_matches else POPULAR_MODELS[:15]
 def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str):
     hf_token = hf_token.strip()
         cfg = cfg.text_config
     num_layers = cfg.num_hidden_layers
     num_attention_heads = cfg.num_attention_heads
     num_kv_heads = getattr(cfg, "num_key_value_heads", num_attention_heads)
             "Requested context length is larger than the max value supported by the model"
         )
     if use_mla:
         kv_lora_rank = cfg.kv_lora_rank
         qk_rope_head_dim = cfg.qk_rope_head_dim
     else:
         head_dim = getattr(cfg, "head_dim", cfg.hidden_size // num_attention_heads)
+        nelems_per_token = num_layers * num_kv_heads * head_dim * 2
         model_config.append(["head_dim", head_dim])
         if attention_type == "GQA":
     return kv_cache_size, model_config
 DESCRIPTION = (
     "Calculate KV cache memory requirements for transformer models. "
     "Supports MHA, GQA, and MLA attention mechanisms with fp16/bf16, fp8, and fp4 data types."
 )
def search_and_update_models(query):
    """Build the dropdown update for the current contents of the search box.

    Args:
        query: Text typed into the search textbox (may be empty or None).

    Returns:
        A ``gr.Dropdown`` update. For queries shorter than two characters
        after stripping, the choices are reset to ``POPULAR_MODELS``.
        Otherwise the choices are the search results, with the typed text
        itself placed first when it is not already among them, and the
        dropdown value is set to the typed text.
    """
    query = query.strip() if query else ""
    if len(query) < 2:
        return gr.Dropdown(choices=POPULAR_MODELS)

    # Work on a private copy: the list returned by search_models may also be
    # held in its cache, and inserting into it would alter later results.
    search_results = list(search_models(query, max_results=50))
    if query not in search_results:
        search_results.insert(0, query)
    return gr.Dropdown(choices=search_results, value=query)
# Page-level CSS: cap the app container at 1000px wide and centre it.
CUSTOM_CSS = """
.gradio-container {
    max-width: 1000px !important;
    margin: 0 auto !important;
}
"""

# The stylesheet is passed through the constructor (as the theme already is)
# rather than assigned to ``demo.css`` after the layout has been built.
with gr.Blocks(title="KV Cache Calculator", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
    gr.Markdown("# KV Cache Calculator")
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        # Left column: model selection and calculation inputs.
        with gr.Column():
            model_search = gr.Textbox(
                label="🔍 Search Models",
                placeholder="Type model name (e.g., llama, qwen, mistral...)",
                value="Qwen/Qwen3-30B-A3B",
                info="Search the entire HuggingFace Hub database"
            )
            # allow_custom_value lets the user enter an id that is not in the list.
            model_dropdown = gr.Dropdown(
                label="📋 Select Model",
                choices=POPULAR_MODELS,
                value="Qwen/Qwen3-30B-A3B",
                allow_custom_value=True,
                info="Models matching your search - or type a custom model ID"
            )
            with gr.Row():
                gr.Markdown("**💡 Tip:** Search updates the dropdown with real HF Hub results")
            ctx_len = gr.Number(label="Context Length", value=128_000, minimum=1)
            num_users = gr.Number(label="Number of Users", value=1, minimum=1)
            dtype = gr.Dropdown(
                label="KV Cache Data Type",
                choices=["fp16/bf16", "fp8", "fp4"],
                value="fp16/bf16"
            )
            hf_token = gr.Textbox(
                label="HuggingFace Token (optional)",
                type="password",
                placeholder="For gated models"
            )
            calculate_btn = gr.Button("Calculate KV Cache Size", variant="primary")
        # Right column: calculation results.
        with gr.Column():
            cache_size = gr.Number(label="KV Cache Size (GB)", precision=2)
            model_config = gr.Dataframe(
                label="Model Configuration",
                headers=["Parameter", "Value"],
                datatype=["str", "str"],
                wrap=True
            )
    # Typing in the search box refreshes the dropdown. The textbox is an
    # input only and is never written back to by the handler.
    model_search.change(
        fn=search_and_update_models,
        inputs=[model_search],
        outputs=[model_dropdown],
        show_progress=False
    )
    # The calculation reads the model id from the dropdown, not the search box.
    calculate_btn.click(
        fn=calculate,
        inputs=[model_dropdown, ctx_len, num_users, dtype, hf_token],
        outputs=[cache_size, model_config]
    )
 if __name__ == "__main__":
     demo.launch(
         server_port=7860,
         share=False,
         show_error=True,
         allowed_paths=[],
         app_kwargs={"docs_url": None, "redoc_url": None}
     )

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- transformers


1	+ transformers>=4.21.0
2	+ huggingface_hub>=0.16.0