import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
from typing import Dict, Tuple, List

# Model specifications (approximate parameter counts and memory requirements)
MODEL_SPECS = {
    "LLaMA-2-7B": {"params": 7e9, "base_memory_gb": 14},
    "LLaMA-2-13B": {"params": 13e9, "base_memory_gb": 26},
    "LLaMA-2-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3-8B": {"params": 8e9, "base_memory_gb": 16},
    "LLaMA-3-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3.1-8B": {"params": 8e9, "base_memory_gb": 16},
    "LLaMA-3.1-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3.1-405B": {"params": 405e9, "base_memory_gb": 810},
    "Nemotron-4-340B": {"params": 340e9, "base_memory_gb": 680},
    "Nemotron-4-15B": {"params": 15e9, "base_memory_gb": 30},
    "Qwen2-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
    "Qwen2-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
    "Qwen2-7B": {"params": 7e9, "base_memory_gb": 14},
    "Qwen2-72B": {"params": 72e9, "base_memory_gb": 144},
    "Qwen2.5-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
    "Qwen2.5-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
    "Qwen2.5-7B": {"params": 7e9, "base_memory_gb": 14},
    "Qwen2.5-14B": {"params": 14e9, "base_memory_gb": 28},
    "Qwen2.5-32B": {"params": 32e9, "base_memory_gb": 64},
    "Qwen2.5-72B": {"params": 72e9, "base_memory_gb": 144},
    # Qwen Vision Language Models
    "Qwen-VL": {"params": 9.6e9, "base_memory_gb": 20},
    "Qwen-VL-Chat": {"params": 9.6e9, "base_memory_gb": 20},
    "Qwen-VL-Plus": {"params": 12e9, "base_memory_gb": 25},
    "Qwen-VL-Max": {"params": 30e9, "base_memory_gb": 65},
    "Qwen2-VL-2B": {"params": 2e9, "base_memory_gb": 5},
    "Qwen2-VL-7B": {"params": 8e9, "base_memory_gb": 18},
    "Qwen2-VL-72B": {"params": 72e9, "base_memory_gb": 150},
    # NVIDIA VILA Series
    "VILA-1.5-3B": {"params": 3e9, "base_memory_gb": 7},
    "VILA-1.5-8B": {"params": 8e9, "base_memory_gb": 18},
    "VILA-1.5-13B": {"params": 13e9, "base_memory_gb": 28},
    "VILA-1.5-40B": {"params": 40e9, "base_memory_gb": 85},
    # Qwen Audio Models
    "Qwen-Audio": {"params": 8e9, "base_memory_gb": 18},
    "Qwen-Audio-Chat": {"params": 8e9, "base_memory_gb": 18},
    "Qwen2-Audio-7B": {"params": 8e9, "base_memory_gb": 18},
    # NVIDIA PhysicsNeMo Models
    "PhysicsNeMo-FNO-Small": {"params": 1e6, "base_memory_gb": 0.5},
    "PhysicsNeMo-FNO-Medium": {"params": 10e6, "base_memory_gb": 2},
    "PhysicsNeMo-FNO-Large": {"params": 50e6, "base_memory_gb": 8},
    "PhysicsNeMo-PINN-Small": {"params": 0.5e6, "base_memory_gb": 0.2},
    "PhysicsNeMo-PINN-Medium": {"params": 5e6, "base_memory_gb": 1},
    "PhysicsNeMo-PINN-Large": {"params": 20e6, "base_memory_gb": 4},
    "PhysicsNeMo-GraphCast-Small": {"params": 50e6, "base_memory_gb": 8},
    "PhysicsNeMo-GraphCast-Medium": {"params": 200e6, "base_memory_gb": 20},
    "PhysicsNeMo-GraphCast-Large": {"params": 1e9, "base_memory_gb": 50},
    "PhysicsNeMo-SFNO-Small": {"params": 25e6, "base_memory_gb": 5},
    "PhysicsNeMo-SFNO-Medium": {"params": 100e6, "base_memory_gb": 15},
    "PhysicsNeMo-SFNO-Large": {"params": 500e6, "base_memory_gb": 35},
}

# H100 specifications
H100_MEMORY_GB = 80  # Memory per GPU
H100_GPUS_PER_NODE = 8  # GPUs per node
H100_NODE_MEMORY_GB = H100_MEMORY_GB * H100_GPUS_PER_NODE  # 640GB per node
H100_COMPUTE_CAPABILITY = "9.0"

# CUDA version recommendations based on model and use case
CUDA_RECOMMENDATIONS = {
    "inference": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    },
    "training": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    },
    "fine_tuning": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    }
}
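# Rough sizing arithmetic behind the tables above (illustrative only): for the LLM
# entries, "base_memory_gb" is scaled at roughly 2 bytes per parameter (an FP16/BF16
# weight footprint), e.g. an 8B-parameter model needs about 8e9 * 2 / 1e9 ≈ 16 GB of
# weights; the physics-ML entries also fold in framework and activation overhead.
# KV-cache memory is estimated per token below from the hidden size and layer count
# (roughly 0.5 MB/token for a 7B-class model at FP16) and multiplied by batch size.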
def calculate_kv_cache_memory(num_tokens: int, model_params: float, num_layers: int = None) -> float:
    """Calculate KV cache memory requirements in GB"""
    if num_layers is None:
        # Estimate layers based on model size
        if model_params < 1e9:
            num_layers = 24
        elif model_params < 10e9:
            num_layers = 32
        elif model_params < 100e9:
            num_layers = 80
        else:
            num_layers = 96

    # KV cache memory per token (approximate):
    # 2 (K and V) * 2 bytes (fp16) * hidden_dim * num_layers
    # Rough hidden-size estimate from params ≈ 12 * num_layers * hidden_dim^2,
    # rounded down to a multiple of 64 (minimum 64)
    hidden_dim = max(64, int((model_params / (num_layers * 12)) ** 0.5) // 64 * 64)
    kv_memory_per_token = 2 * 2 * hidden_dim * num_layers / (1024**3)  # GB

    return num_tokens * kv_memory_per_token


def estimate_h100_nodes(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    batch_size: int,
    use_case: str,
    precision: str
) -> Tuple[int, str, Dict]:
    """
    Estimate the number of H100 nodes required

    Returns:
    - Number of nodes required
    - Detailed explanation
    - Dictionary with breakdown
    """
    if model_name not in MODEL_SPECS:
        return 1, f"Model {model_name} not found in specifications", {}

    model_spec = MODEL_SPECS[model_name]
    base_memory = model_spec["base_memory_gb"]

    # Adjust memory based on precision.
    # base_memory_gb is quoted at roughly an FP16/BF16 footprint (~2 bytes/parameter
    # for the LLM entries), so the multipliers are relative to that baseline.
    precision_multiplier = {
        "FP32": 2.0,
        "FP16": 1.0,
        "BF16": 1.0,
        "INT8": 0.5,
        "INT4": 0.25
    }
    model_memory = base_memory * precision_multiplier.get(precision, 1.0)

    # Calculate KV cache memory
    total_tokens = input_tokens + output_tokens
    kv_cache_memory = calculate_kv_cache_memory(total_tokens, model_spec["params"]) * batch_size

    # Use case specific memory overhead
    overhead_multiplier = {
        "inference": 1.2,    # 20% overhead
        "training": 3.0,     # 3x for gradients, optimizer states
        "fine_tuning": 2.5   # 2.5x for fine-tuning
    }

    total_memory_per_instance = (model_memory + kv_cache_memory) * overhead_multiplier.get(use_case, 1.2)

    # Calculate nodes needed
    memory_per_node = H100_NODE_MEMORY_GB * 0.9  # Reserve 10% for system (576GB usable per node)
    nodes_needed = max(1, int(np.ceil(total_memory_per_instance / memory_per_node)))

    # For very large models, consider model parallelism
    if model_memory > memory_per_node:
        min_nodes_for_model = int(np.ceil(model_memory / memory_per_node))
        nodes_needed = max(nodes_needed, min_nodes_for_model)

    # Generate explanation
    explanation = f"""
**Estimation Breakdown:**

• **Model**: {model_name} ({model_spec['params']/1e9:.1f}B parameters)
• **Precision**: {precision}
• **Model Memory**: {model_memory:.1f} GB
• **KV Cache Memory**: {kv_cache_memory:.1f} GB (for {total_tokens:,} tokens × {batch_size} batch size)
• **Use Case Overhead**: {overhead_multiplier.get(use_case, 1.2):.1f}x ({use_case})
• **Total Memory Required**: {total_memory_per_instance:.1f} GB

• **H100 Node Specs**: {H100_GPUS_PER_NODE} × {H100_MEMORY_GB}GB = {H100_NODE_MEMORY_GB}GB per node
• **Usable Memory**: {memory_per_node:.1f} GB per node (10% reserved)

**Recommendation**: {nodes_needed} H100 node(s) ({nodes_needed * H100_GPUS_PER_NODE} H100 GPUs total)
"""

    breakdown = {
        "model_memory_gb": model_memory,
        "kv_cache_memory_gb": kv_cache_memory,
        "total_memory_gb": total_memory_per_instance,
        "h100_memory_per_node_gb": memory_per_node,
        "nodes_required": nodes_needed
    }

    return nodes_needed, explanation, breakdown


def get_cuda_recommendation(use_case: str) -> str:
    """Get CUDA version recommendation based on use case"""
    cuda_info = CUDA_RECOMMENDATIONS.get(use_case, CUDA_RECOMMENDATIONS["inference"])

    recommendation = f"""
**CUDA Version Recommendations for {use_case.title()}:**

• **Optimal**: CUDA {cuda_info['optimal']} + cuDNN 8.9+
• **Recommended**: CUDA {cuda_info['recommended']} + cuDNN 8.7+
• **Minimum**: CUDA {cuda_info['minimum']} + cuDNN 8.5+

**Additional Requirements:**

• **Driver Version**: 525.60.13+ (Linux) / 527.41+ (Windows)
• **Compute Capability**: {H100_COMPUTE_CAPABILITY} (H100 native)
• **Node Configuration**: {H100_GPUS_PER_NODE} × H100 GPUs per node ({H100_NODE_MEMORY_GB}GB total)
• **Memory**: ECC enabled recommended for production
"""

    return recommendation


def create_performance_chart(breakdown: Dict) -> plt.Figure:
    """Create a memory utilization chart"""
    if not breakdown:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.5, 'No data to display', ha='center', va='center')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        return fig

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Memory breakdown pie chart
    labels = ['Model Memory', 'KV Cache', 'Overhead']
    model_mem = breakdown['model_memory_gb']
    kv_mem = breakdown['kv_cache_memory_gb']
    overhead_mem = breakdown['total_memory_gb'] - model_mem - kv_mem

    sizes = [model_mem, kv_mem, overhead_mem]
    colors = ['#ff9999', '#66b3ff', '#99ff99']

    ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    ax1.set_title('Memory Breakdown')

    # Node utilization bar chart
    nodes = breakdown['nodes_required']
    total_memory = breakdown['total_memory_gb']
    memory_per_node = breakdown['h100_memory_per_node_gb']

    node_labels = [f'Node {i+1}' for i in range(nodes)]
    utilization = []

    for i in range(nodes):
        if i < nodes - 1:
            utilization.append(memory_per_node)
        else:
            remaining_memory = total_memory - (nodes - 1) * memory_per_node
            utilization.append(remaining_memory)

    utilization_pct = [u / memory_per_node * 100 for u in utilization]

    bars = ax2.bar(node_labels, utilization_pct, color='skyblue', alpha=0.7)
    ax2.axhline(y=100, color='red', linestyle='--', alpha=0.7, label='Max Capacity')
    ax2.set_ylabel('Memory Utilization (%)')
    ax2.set_title('H100 Node Memory Utilization')
    ax2.set_ylim(0, 110)
    ax2.legend()

    # Add value labels on bars
    for bar, pct in zip(bars, utilization_pct):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                 f'{pct:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    return fig


def estimate_nodes_interface(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    batch_size: int,
    use_case: str,
    precision: str
):
    """Main interface function"""
    # gr.Number components return floats (or None); normalize before validating
    input_tokens = int(input_tokens or 0)
    output_tokens = int(output_tokens or 0)
    batch_size = int(batch_size or 0)

    # Validate inputs
    if input_tokens <= 0 or output_tokens <= 0:
        return "Please enter valid token counts (> 0)", "", None, "## ⚠️ **Invalid Input: Token counts must be > 0**"

    if batch_size <= 0:
        return "Please enter a valid batch size (> 0)", "", None, "## ⚠️ **Invalid Input: Batch size must be > 0**"

    # Calculate node requirements
    nodes_needed, explanation, breakdown = estimate_h100_nodes(
        model_name, input_tokens, output_tokens, batch_size, use_case, precision
    )

    # Get CUDA recommendations
    cuda_rec = get_cuda_recommendation(use_case)

    # Create performance chart
    fig = create_performance_chart(breakdown)

    return explanation, cuda_rec, fig, f"## 🖥️ **Estimated H100 Nodes Required: {nodes_needed}**"


# Create Gradio interface
def create_interface():
    with gr.Blocks(title="H100 Node Estimator", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🚀 H100 Node & CUDA Version Estimator")
        gr.Markdown("Get recommendations for H100 node count and CUDA version based on your model and workload requirements.")
        gr.Markdown("**Comprehensive Model Support**: LLaMA, Nemotron, Qwen2/2.5, Qwen-VL, VILA, Qwen-Audio, and **NVIDIA PhysicsNeMo** series!")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## Input Parameters")

                model_dropdown = gr.Dropdown(
                    choices=list(MODEL_SPECS.keys()),
value="LLaMA-3-8B", label="Model", info="Select the model you want to run (includes LLMs, multimodal, and physics-ML models)" ) input_tokens = gr.Number( value=2048, label="Input Tokens", info="Number of input tokens per request" ) output_tokens = gr.Number( value=512, label="Output Tokens", info="Number of output tokens per request" ) batch_size = gr.Number( value=1, label="Batch Size", info="Number of concurrent requests" ) use_case = gr.Dropdown( choices=["inference", "training", "fine_tuning"], value="inference", label="Use Case", info="What will you use the model for?" ) precision = gr.Dropdown( choices=["FP32", "FP16", "BF16", "INT8", "INT4"], value="FP16", label="Precision", info="Model precision/quantization" ) estimate_btn = gr.Button("💡 Estimate Requirements", variant="primary") with gr.Column(scale=2): gr.Markdown("## Results") node_count = gr.Markdown("## 🖥️ **Ready to estimate...**") with gr.Tab("📊 Detailed Analysis"): detailed_output = gr.Markdown() with gr.Tab("🔧 CUDA Recommendations"): cuda_output = gr.Markdown() with gr.Tab("📈 Memory Utilization"): chart_output = gr.Plot() # Connect the interface estimate_btn.click( fn=estimate_nodes_interface, inputs=[model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision], outputs=[detailed_output, cuda_output, chart_output, node_count] ) # Add examples gr.Markdown("## 💡 Example Scenarios") examples = [ ["LLaMA-3-8B", 2048, 512, 1, "inference", "FP16"], ["LLaMA-3-70B", 4096, 1024, 4, "inference", "FP16"], ["Qwen2.5-72B", 8192, 2048, 2, "fine_tuning", "BF16"], ["Nemotron-4-340B", 2048, 1024, 1, "inference", "INT8"], ["Qwen2-VL-7B", 1024, 256, 1, "inference", "FP16"], ["VILA-1.5-13B", 2048, 512, 2, "inference", "BF16"], ["Qwen2-Audio-7B", 1024, 256, 1, "inference", "FP16"], ["PhysicsNeMo-FNO-Large", 512, 128, 8, "training", "FP32"], ["PhysicsNeMo-GraphCast-Medium", 1024, 256, 4, "training", "FP16"], ] gr.Examples( examples=examples, inputs=[model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision], outputs=[detailed_output, cuda_output, chart_output, node_count], fn=estimate_nodes_interface, cache_examples=False ) gr.Markdown(""" ## ℹ️ Notes - **Multimodal Models**: Vision-language and audio models may require additional memory for image/audio processing - **PhysicsNeMo Models**: Physics-ML models (FNO, PINN, GraphCast, SFNO) typically require higher batch sizes for training - **Token Estimation**: For multimodal models, consider image patches (~256-1024 tokens per image) and audio frames - **Physics Simulations**: PhysicsNeMo models often work with spatial/temporal grids rather than tokens - Estimates are approximate and may vary based on actual implementation details - Memory calculations include model weights, KV cache, and operational overhead - Consider network bandwidth and storage requirements for multi-node setups - For production deployments, add 10-20% buffer for optimal performance """) return demo if __name__ == "__main__": demo = create_interface() demo.launch(share=True, server_name="0.0.0.0", server_port=7860)