Spaces:

HathoraResearch
/

LLM-KV-cache-calculator

Running

App Files Files Community

AndreHathora committed on Sep 11

Commit

a38c422

1 Parent(s): 160a197

updates

Browse files

Files changed (2) hide show

__pycache__/app.cpython-310.pyc +0 -0
app.py +52 -129

__pycache__/app.cpython-310.pyc ADDED Viewed

Binary file (21 kB). View file

app.py CHANGED Viewed

@@ -5,9 +5,7 @@ import asyncio
 from typing import List
 import time
 from functools import lru_cache
-import requests
 import json
-import re
 from datetime import datetime, timedelta
 import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -59,6 +57,14 @@ GPU_SPECS = {
     "RTX 4080 Super": {"memory_gb": 16, "compute_capability": "8.9", "tflops_fp32": 52.2, "category": "Consumer"},
     "RTX 4090": {"memory_gb": 24, "compute_capability": "8.9", "tflops_fp32": 83.0, "category": "Consumer"},
     # Professional/Workstation RTX A Series (Ampere) - SM_8.6
     "RTX A2000": {"memory_gb": 12, "compute_capability": "8.6", "tflops_fp32": 8.0, "category": "Workstation"},
     "RTX A4000": {"memory_gb": 16, "compute_capability": "8.6", "tflops_fp32": 19.2, "category": "Workstation"},
@@ -81,7 +87,7 @@ GPU_SPECS = {
     "H200 141GB": {"memory_gb": 141, "compute_capability": "9.0", "tflops_fp32": 67.0, "category": "Datacenter"},
     # Datacenter B200 (Blackwell) - SM_10.0
-    "B200 192GB": {"memory_gb": 180, "compute_capability": "10.0", "tflops_fp32": 80.0, "category": "Datacenter"},
     # Datacenter L40/L40S (Ada Lovelace) - SM_8.9
     "L40": {"memory_gb": 48, "compute_capability": "8.9", "tflops_fp32": 91.6, "category": "Datacenter"},
@@ -96,9 +102,9 @@ def fetch_single_gpu_price(gpu_name):
     """Fetch price for a single GPU (used in parallel)"""
     try:
         print(f"Fetching price for {gpu_name}...")
-        price = get_gpu_price_from_multiple_sources(gpu_name)
         if price:
-            print(f"✓ Found price for {gpu_name}: ${price}")
             return gpu_name, price
         else:
             print(f"✗ No price found for {gpu_name}, using fallback")
@@ -109,7 +115,7 @@ def fetch_single_gpu_price(gpu_name):
 def preload_gpu_prices():
     """Pre-fetch all GPU prices in parallel on startup"""
-    print("🚀 Pre-loading GPU prices...")
     start_time = time.time()
     # Get list of GPUs to price
@@ -133,8 +139,8 @@ def preload_gpu_prices():
     end_time = time.time()
     total_time = end_time - start_time
-    print(f"✅ Loaded prices for {len(gpu_names)} GPUs in {total_time:.1f} seconds")
-    print(f"💰 Cache contains {len(price_cache)} price entries")
 def start_price_preloading():
     """Start price preloading in background thread"""
@@ -144,10 +150,10 @@ def start_price_preloading():
     # Start preloading in background
     preload_thread = threading.Thread(target=preload_worker, daemon=True)
     preload_thread.start()
-    print("🔄 Price preloading started in background...")
-def get_gpu_price_from_multiple_sources(gpu_name):
-    """Fetch GPU price from multiple sources with fallbacks"""
     current_time = datetime.now()
     # Check cache first
@@ -157,103 +163,19 @@ def get_gpu_price_from_multiple_sources(gpu_name):
         if current_time - cached_data["timestamp"] < PRICE_CACHE_DURATION:
             return cached_data["price"]
-    price = None
-    try:
-        gpu_specs = GPU_SPECS.get(gpu_name, {})
-        gpu_category = gpu_specs.get("category", "Consumer")
-        if gpu_category == "Datacenter":
-            price = get_fallback_price(gpu_name)
-        else:
-            price = fetch_newegg_price(gpu_name)
-            if not price:
-                price = fetch_amazon_price(gpu_name)
-            if not price:
-                price = get_fallback_price(gpu_name)
-    except Exception as e:
-        print(f"Error fetching price for {gpu_name}: {e}")
-        price = get_fallback_price(gpu_name)
     # Cache the result
-    if price:
-        price_cache[cache_key] = {
-            "price": price,
-            "timestamp": current_time
-        }
     return price
-def fetch_newegg_price(gpu_name):
-    """Fetch price from Newegg search (simplified approach)"""
-    try:
-        # Simple approach: search for GPU and extract price patterns
-        search_term = gpu_name.replace(" ", "+")
-        url = f"https://www.newegg.com/p/pl?d={search_term}"
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-        }
-        response = requests.get(url, headers=headers, timeout=2)
-        if response.status_code == 200:
-            # Look for price patterns in the HTML
-            price_patterns = [
-                r'\$([0-9,]+\.?\d*)',
-                r'price.*?(\d+[,.]?\d*)',
-                r'(\d{3,4})\.\d{2}'
-            ]
-            for pattern in price_patterns:
-                matches = re.findall(pattern, response.text)
-                if matches:
-                    # Get the first reasonable price (between $200-$3000)
-                    for match in matches:
-                        try:
-                            price = float(match.replace(',', ''))
-                            if 200 <= price <= 3000:
-                                return price
-                        except:
-                            continue
-    except:
-        pass
-    return None
-def fetch_amazon_price(gpu_name):
-    """Fetch price from Amazon search (simplified approach)"""
-    try:
-        search_term = gpu_name.replace(" ", "+")
-        url = f"https://www.amazon.com/s?k={search_term}+graphics+card"
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-        }
-        response = requests.get(url, headers=headers, timeout=2)
-        if response.status_code == 200:
-            # Look for Amazon price patterns
-            price_patterns = [
-                r'\$([0-9,]+\.?\d*)',
-                r'a-price-whole.*?(\d+)',
-            ]
-            for pattern in price_patterns:
-                matches = re.findall(pattern, response.text)
-                if matches:
-                    for match in matches:
-                        try:
-                            price = float(match.replace(',', ''))
-                            if 200 <= price <= 3000:
-                                return price
-                        except:
-                            continue
-    except:
-        pass
-    return None
 def get_fallback_price(gpu_name):
-    """Fallback prices based on typical market values (updated periodically)"""
     fallback_prices = {
         # Consumer RTX 30 Series
         "RTX 3060": 280,
@@ -277,6 +199,14 @@ def get_fallback_price(gpu_name):
         "RTX 4080 Super": 880,
         "RTX 4090": 1500,
         # Professional/Workstation GPUs
         "RTX A2000": 650,
         "RTX A4000": 1200,
@@ -285,15 +215,15 @@ def get_fallback_price(gpu_name):
         "RTX A6000": 4500,
         "RTX 6000 Ada": 6800,
-        # Datacenter GPUs (estimated enterprise pricing)
         "A100 40GB": 12000,
         "A100 80GB": 15000,
-        "H100 80GB": 28000,
-        "H100 94GB": 32000,
-        "H200 141GB": 35000,
-        "B200 192GB": 45000,
-        "L40": 8500,
-        "L40S": 9500,
     }
     return fallback_prices.get(gpu_name, 1000)
@@ -677,11 +607,6 @@ def calculate_multi_gpu_configs(total_memory_needed, suitable_gpus):
                 else:
                     config_name = f"{count}x {gpu['name']} (TP={count})"
-                category_emoji = {
-                    "Consumer": "🎮",
-                    "Workstation": "🏢",
-                    "Datacenter": "🏭"
-                }.get(gpu.get("category", "Consumer"), "🎮")
                 multi_gpu_configs.append({
                     "config": config_name,
@@ -691,7 +616,6 @@ def calculate_multi_gpu_configs(total_memory_needed, suitable_gpus):
                     "utilization": utilization,
                     "total_cost": total_cost,
                     "cost_per_tflop": cost_per_tflop_total,
-                    "category_emoji": category_emoji,
                     "base_gpu": gpu
                 })
@@ -724,7 +648,7 @@ def recommend_gpus(kv_cache_size_gb, config=None, dtype="fp16/bf16", ctx_len=128
     for gpu_name, specs in GPU_SPECS.items():
         # Get real-time price (will use cache if available)
-        current_price = get_gpu_price_from_multiple_sources(gpu_name)
         if current_price:
             cost_per_tflop = current_price / specs["tflops_fp32"]
             all_gpus.append({
@@ -758,23 +682,22 @@ def recommend_gpus(kv_cache_size_gb, config=None, dtype="fp16/bf16", ctx_len=128
     # Format recommendations
     recommendations = []
     for i, config in enumerate(multi_gpu_configs):
-        rank_icons = ["🥇", "🥈", "🥉", "🏅", "⭐", "💫", "🌟", "✨"]
-        rank = rank_icons[i] if i < len(rank_icons) else "💎"
-        price_source = "💲 Live" if config["base_gpu"]["name"].lower().replace(" ", "_") in price_cache else "📊 Est"
         # Format configuration display
-        if config["gpu_count"] == 1:
-            config_display = f"{rank} {config['category_emoji']} {config['config']}"
-            memory_display = f"{config['total_memory_gb']:.0f} GB"
-        else:
-            config_display = f"{rank} {config['category_emoji']} {config['config']}"
-            memory_display = f"{config['total_memory_gb']:.0f} GB ({config['utilization']:.0f}% util)"
         recommendations.append([
             config_display,
-            f"{total_memory_needed:.1f}GB required",
-            f"{price_source} ${config['total_cost']:.0f}"
         ])
     return recommendations
@@ -826,9 +749,9 @@ with gr.Blocks(title="KV Cache Calculator", theme=gr.themes.Soft()) as demo:
             )
             gpu_recommendations = gr.Dataframe(
-                label="💡 GPU Recommendations",
-                headers=["Configuration", "Memory Required", "Total Price"],
-                datatype=["str", "str", "str"],
                 wrap=False,
                 visible=False
             )

 from typing import List
 import time
 from functools import lru_cache
 import json
 from datetime import datetime, timedelta
 import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
     "RTX 4080 Super": {"memory_gb": 16, "compute_capability": "8.9", "tflops_fp32": 52.2, "category": "Consumer"},
     "RTX 4090": {"memory_gb": 24, "compute_capability": "8.9", "tflops_fp32": 83.0, "category": "Consumer"},
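+    # Consumer RTX 50 Series (Blackwell - GB202/GB203/GB205/GB206/GB207) - SM_12.0 (NOTE(review): NVIDIA lists GeForce RTX 50 as compute capability 12.0; confirm the "10.0" values in the entries below)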
+    "RTX 5060": {"memory_gb": 8, "compute_capability": "10.0", "tflops_fp32": 18.5, "category": "Consumer"},
+    "RTX 5060 Ti": {"memory_gb": 16, "compute_capability": "10.0", "tflops_fp32": 28.2, "category": "Consumer"},
+    "RTX 5070": {"memory_gb": 12, "compute_capability": "10.0", "tflops_fp32": 35.1, "category": "Consumer"},
+    "RTX 5070 Ti": {"memory_gb": 16, "compute_capability": "10.0", "tflops_fp32": 48.3, "category": "Consumer"},
+    "RTX 5080": {"memory_gb": 16, "compute_capability": "10.0", "tflops_fp32": 60.5, "category": "Consumer"},
+    "RTX 5090": {"memory_gb": 32, "compute_capability": "10.0", "tflops_fp32": 125.0, "category": "Consumer"},
     # Professional/Workstation RTX A Series (Ampere) - SM_8.6
     "RTX A2000": {"memory_gb": 12, "compute_capability": "8.6", "tflops_fp32": 8.0, "category": "Workstation"},
     "RTX A4000": {"memory_gb": 16, "compute_capability": "8.6", "tflops_fp32": 19.2, "category": "Workstation"},
     "H200 141GB": {"memory_gb": 141, "compute_capability": "9.0", "tflops_fp32": 67.0, "category": "Datacenter"},
     # Datacenter B200 (Blackwell) - SM_10.0
+    "B200 180GB": {"memory_gb": 180, "compute_capability": "10.0", "tflops_fp32": 80.0, "category": "Datacenter"},
     # Datacenter L40/L40S (Ada Lovelace) - SM_8.9
     "L40": {"memory_gb": 48, "compute_capability": "8.9", "tflops_fp32": 91.6, "category": "Datacenter"},
     """Fetch price for a single GPU (used in parallel)"""
     try:
         print(f"Fetching price for {gpu_name}...")
+        price = get_gpu_price(gpu_name)
         if price:
+            print(f"Found price for {gpu_name}: ${price}")
             return gpu_name, price
         else:
             print(f"✗ No price found for {gpu_name}, using fallback")
 def preload_gpu_prices():
     """Pre-fetch all GPU prices in parallel on startup"""
+    print("Pre-loading GPU prices...")
     start_time = time.time()
     # Get list of GPUs to price
     end_time = time.time()
     total_time = end_time - start_time
+    print(f"Loaded prices for {len(gpu_names)} GPUs in {total_time:.1f} seconds")
+    print(f"Cache contains {len(price_cache)} price entries")
 def start_price_preloading():
     """Start price preloading in background thread"""
     # Start preloading in background
     preload_thread = threading.Thread(target=preload_worker, daemon=True)
     preload_thread.start()
+    print("Price preloading started in background...")
+def get_gpu_price(gpu_name):
+    """Get GPU price from curated pricing data"""
     current_time = datetime.now()
     # Check cache first
         if current_time - cached_data["timestamp"] < PRICE_CACHE_DURATION:
             return cached_data["price"]
+    price = get_fallback_price(gpu_name)
     # Cache the result
+    price_cache[cache_key] = {
+        "price": price,
+        "timestamp": current_time
+    }
     return price
 def get_fallback_price(gpu_name):
+    """Curated GPU pricing data"""
     fallback_prices = {
         # Consumer RTX 30 Series
         "RTX 3060": 280,
         "RTX 4080 Super": 880,
         "RTX 4090": 1500,
+        # Consumer RTX 50 Series (Expected pricing)
+        "RTX 5060": 400,
+        "RTX 5060 Ti": 600,
+        "RTX 5070": 800,
+        "RTX 5070 Ti": 1000,
+        "RTX 5080": 1200,
+        "RTX 5090": 2000,
         # Professional/Workstation GPUs
         "RTX A2000": 650,
         "RTX A4000": 1200,
         "RTX A6000": 4500,
         "RTX 6000 Ada": 6800,
+        # Datacenter GPUs (current enterprise pricing)
         "A100 40GB": 12000,
         "A100 80GB": 15000,
+        "H100 80GB": 30000,
+        "H100 94GB": 35000,
+        "H200 141GB": 40000,
+        "B200 180GB": 50000,
+        "L40": 9000,
+        "L40S": 10000,
     }
     return fallback_prices.get(gpu_name, 1000)
                 else:
                     config_name = f"{count}x {gpu['name']} (TP={count})"
                 multi_gpu_configs.append({
                     "config": config_name,
                     "utilization": utilization,
                     "total_cost": total_cost,
                     "cost_per_tflop": cost_per_tflop_total,
                     "base_gpu": gpu
                 })
     for gpu_name, specs in GPU_SPECS.items():
         # Get curated price (will use cache if available)
+        current_price = get_gpu_price(gpu_name)
         if current_price:
             cost_per_tflop = current_price / specs["tflops_fp32"]
             all_gpus.append({
     # Format recommendations
     recommendations = []
     for i, config in enumerate(multi_gpu_configs):
+        rank = f"#{i+1}"
+        price_source = "Live" if config["base_gpu"]["name"].lower().replace(" ", "_") in price_cache else "Est"
         # Format configuration display
+        config_display = f"{rank} {config['config']}"
+        # Calculate price efficiency: total FP32 TFLOPS per dollar of total cost
+        total_tflops = config["base_gpu"]["tflops_fp32"] * config["gpu_count"]
+        flops_per_dollar = total_tflops / config['total_cost']
         recommendations.append([
             config_display,
+            f"{flops_per_dollar:.3f}",
+            f"{total_memory_needed:.1f}GB",
+            f"${config['total_cost']:.0f}"
         ])
     return recommendations
             )
             gpu_recommendations = gr.Dataframe(
+                label="GPU Recommendations",
+                headers=["Configuration", "TFLOPS/$", "Memory", "Price"],
+                datatype=["str", "str", "str", "str"],
                 wrap=False,
                 visible=False
             )