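"""Gradio app for estimating H100 node requirements.

Given a model, token counts, batch size, use case, and precision, the app
estimates how many 8x H100 (80 GB) nodes are needed and suggests CUDA versions.
"""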
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from typing import Dict, Optional, Tuple
# Model specifications (approximate parameter counts and FP16-sized memory
# footprints, ~2 bytes per parameter)
MODEL_SPECS = {
"LLaMA-2-7B": {"params": 7e9, "base_memory_gb": 14},
"LLaMA-2-13B": {"params": 13e9, "base_memory_gb": 26},
"LLaMA-2-70B": {"params": 70e9, "base_memory_gb": 140},
"LLaMA-3-8B": {"params": 8e9, "base_memory_gb": 16},
"LLaMA-3-70B": {"params": 70e9, "base_memory_gb": 140},
"LLaMA-3.1-8B": {"params": 8e9, "base_memory_gb": 16},
"LLaMA-3.1-70B": {"params": 70e9, "base_memory_gb": 140},
"LLaMA-3.1-405B": {"params": 405e9, "base_memory_gb": 810},
"Nemotron-4-340B": {"params": 340e9, "base_memory_gb": 680},
"Nemotron-4-15B": {"params": 15e9, "base_memory_gb": 30},
"Qwen2-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
"Qwen2-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
"Qwen2-7B": {"params": 7e9, "base_memory_gb": 14},
"Qwen2-72B": {"params": 72e9, "base_memory_gb": 144},
"Qwen2.5-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
"Qwen2.5-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
"Qwen2.5-7B": {"params": 7e9, "base_memory_gb": 14},
"Qwen2.5-14B": {"params": 14e9, "base_memory_gb": 28},
"Qwen2.5-32B": {"params": 32e9, "base_memory_gb": 64},
"Qwen2.5-72B": {"params": 72e9, "base_memory_gb": 144},
# Qwen Vision Language Models
"Qwen-VL": {"params": 9.6e9, "base_memory_gb": 20},
"Qwen-VL-Chat": {"params": 9.6e9, "base_memory_gb": 20},
"Qwen-VL-Plus": {"params": 12e9, "base_memory_gb": 25},
"Qwen-VL-Max": {"params": 30e9, "base_memory_gb": 65},
"Qwen2-VL-2B": {"params": 2e9, "base_memory_gb": 5},
"Qwen2-VL-7B": {"params": 8e9, "base_memory_gb": 18},
"Qwen2-VL-72B": {"params": 72e9, "base_memory_gb": 150},
# NVIDIA VILA Series
"VILA-1.5-3B": {"params": 3e9, "base_memory_gb": 7},
"VILA-1.5-8B": {"params": 8e9, "base_memory_gb": 18},
"VILA-1.5-13B": {"params": 13e9, "base_memory_gb": 28},
"VILA-1.5-40B": {"params": 40e9, "base_memory_gb": 85},
# Qwen Audio Models
"Qwen-Audio": {"params": 8e9, "base_memory_gb": 18},
"Qwen-Audio-Chat": {"params": 8e9, "base_memory_gb": 18},
"Qwen2-Audio-7B": {"params": 8e9, "base_memory_gb": 18},
# NVIDIA PhysicsNeMo Models
"PhysicsNeMo-FNO-Small": {"params": 1e6, "base_memory_gb": 0.5},
"PhysicsNeMo-FNO-Medium": {"params": 10e6, "base_memory_gb": 2},
"PhysicsNeMo-FNO-Large": {"params": 50e6, "base_memory_gb": 8},
"PhysicsNeMo-PINN-Small": {"params": 0.5e6, "base_memory_gb": 0.2},
"PhysicsNeMo-PINN-Medium": {"params": 5e6, "base_memory_gb": 1},
"PhysicsNeMo-PINN-Large": {"params": 20e6, "base_memory_gb": 4},
"PhysicsNeMo-GraphCast-Small": {"params": 50e6, "base_memory_gb": 8},
"PhysicsNeMo-GraphCast-Medium": {"params": 200e6, "base_memory_gb": 20},
"PhysicsNeMo-GraphCast-Large": {"params": 1e9, "base_memory_gb": 50},
"PhysicsNeMo-SFNO-Small": {"params": 25e6, "base_memory_gb": 5},
"PhysicsNeMo-SFNO-Medium": {"params": 100e6, "base_memory_gb": 15},
"PhysicsNeMo-SFNO-Large": {"params": 500e6, "base_memory_gb": 35},
}
# H100 specifications
H100_MEMORY_GB = 80 # Memory per GPU
H100_GPUS_PER_NODE = 8 # GPUs per node
H100_NODE_MEMORY_GB = H100_MEMORY_GB * H100_GPUS_PER_NODE # 640GB per node
H100_COMPUTE_CAPABILITY = "9.0"
# CUDA version recommendations based on model and use case
CUDA_RECOMMENDATIONS = {
"inference": {
"recommended": "12.1+",
"minimum": "11.8",
"optimal": "12.4"
},
"training": {
"recommended": "12.1+",
"minimum": "11.8",
"optimal": "12.4"
},
"fine_tuning": {
"recommended": "12.1+",
"minimum": "11.8",
"optimal": "12.4"
}
}
def calculate_kv_cache_memory(num_tokens: int, model_params: float, num_layers: Optional[int] = None) -> float:
"""Calculate KV cache memory requirements in GB"""
if num_layers is None:
# Estimate layers based on model size
if model_params < 1e9:
num_layers = 24
elif model_params < 10e9:
num_layers = 32
elif model_params < 100e9:
num_layers = 80
else:
num_layers = 96
    # KV cache memory per token (approximate):
    # 2 (K + V) * 2 bytes (fp16) * hidden_dim * num_layers
    # hidden_dim is roughly back-solved from params ≈ 12 * num_layers * hidden_dim^2,
    # rounded down to a multiple of 64
    hidden_dim = max(64, int((model_params / (num_layers * 12)) ** 0.5) // 64 * 64)
    kv_memory_per_token = 2 * 2 * hidden_dim * num_layers / (1024**3)  # GB per token
return num_tokens * kv_memory_per_token
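# Worked example for calculate_kv_cache_memory (illustrative, using the rough
# formula above): an 8B-parameter model is assigned 32 layers and
# hidden_dim ≈ 4544, so KV bytes/token ≈ 2 (K+V) * 2 (fp16) * 4544 * 32 ≈ 0.55 MB,
# i.e. roughly 1.4 GB of KV cache for 2,560 tokens per sequence.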
def estimate_h100_nodes(
model_name: str,
input_tokens: int,
output_tokens: int,
batch_size: int,
use_case: str,
precision: str
) -> Tuple[int, str, Dict]:
"""
Estimate the number of H100 nodes required
Returns:
- Number of nodes required
- Detailed explanation
- Dictionary with breakdown
"""
if model_name not in MODEL_SPECS:
return 1, f"Model {model_name} not found in specifications", {}
model_spec = MODEL_SPECS[model_name]
base_memory = model_spec["base_memory_gb"]
    # Adjust memory for precision; base_memory_gb values are FP16-sized
    # (~2 bytes per parameter), so FP16/BF16 use the baseline as-is
    precision_multiplier = {
        "FP32": 2.0,
        "FP16": 1.0,
        "BF16": 1.0,
        "INT8": 0.5,
        "INT4": 0.25
    }
    model_memory = base_memory * precision_multiplier.get(precision, 1.0)
# Calculate KV cache memory
total_tokens = input_tokens + output_tokens
kv_cache_memory = calculate_kv_cache_memory(total_tokens, model_spec["params"]) * batch_size
# Use case specific memory overhead
overhead_multiplier = {
"inference": 1.2, # 20% overhead
"training": 3.0, # 3x for gradients, optimizer states
"fine_tuning": 2.5 # 2.5x for fine-tuning
}
total_memory_per_instance = (model_memory + kv_cache_memory) * overhead_multiplier.get(use_case, 1.2)
# Calculate nodes needed
memory_per_node = H100_NODE_MEMORY_GB * 0.9 # Reserve 10% for system (576GB usable per node)
nodes_needed = max(1, int(np.ceil(total_memory_per_instance / memory_per_node)))
# For very large models, consider model parallelism
if model_memory > memory_per_node:
min_nodes_for_model = int(np.ceil(model_memory / memory_per_node))
nodes_needed = max(nodes_needed, min_nodes_for_model)
# Generate explanation
explanation = f"""
**Estimation Breakdown:**
• **Model**: {model_name} ({model_spec['params']/1e9:.1f}B parameters)
• **Precision**: {precision}
• **Model Memory**: {model_memory:.1f} GB
• **KV Cache Memory**: {kv_cache_memory:.1f} GB (for {total_tokens:,} tokens × {batch_size} batch size)
• **Use Case Overhead**: {overhead_multiplier.get(use_case, 1.2):.1f}x ({use_case})
• **Total Memory Required**: {total_memory_per_instance:.1f} GB
• **H100 Node Specs**: {H100_GPUS_PER_NODE} × {H100_MEMORY_GB}GB = {H100_NODE_MEMORY_GB}GB per node
• **Usable Memory**: {memory_per_node:.1f} GB per node (10% reserved)
**Recommendation**: {nodes_needed} H100 node(s) ({nodes_needed * H100_GPUS_PER_NODE} H100 GPUs total)
"""
breakdown = {
"model_memory_gb": model_memory,
"kv_cache_memory_gb": kv_cache_memory,
"total_memory_gb": total_memory_per_instance,
"h100_memory_per_node_gb": memory_per_node,
"nodes_required": nodes_needed
}
return nodes_needed, explanation, breakdown
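# Rough sanity check for estimate_h100_nodes (illustrative, mirrors the
# LLaMA-3-70B example scenario below): FP16 weights ≈ 140 GB, KV cache for
# (4096 + 1024) tokens x batch 4 ≈ 52 GB, x1.2 inference overhead ≈ 230 GB,
# which fits on a single 8x H100 node (576 GB usable).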
def get_cuda_recommendation(use_case: str) -> str:
"""Get CUDA version recommendation based on use case"""
cuda_info = CUDA_RECOMMENDATIONS.get(use_case, CUDA_RECOMMENDATIONS["inference"])
recommendation = f"""
**CUDA Version Recommendations for {use_case.replace('_', ' ').title()}:**
• **Optimal**: CUDA {cuda_info['optimal']} + cuDNN 8.9+
• **Recommended**: CUDA {cuda_info['recommended']} + cuDNN 8.7+
• **Minimum**: CUDA {cuda_info['minimum']} + cuDNN 8.5+
**Additional Requirements:**
• **Driver Version**: 525.60.13+ (Linux) / 527.41+ (Windows)
• **Compute Capability**: {H100_COMPUTE_CAPABILITY} (H100 native)
• **Node Configuration**: {H100_GPUS_PER_NODE} × H100 GPUs per node ({H100_NODE_MEMORY_GB}GB total)
• **Memory**: ECC enabled recommended for production
"""
return recommendation
def create_performance_chart(breakdown: Dict) -> plt.Figure:
"""Create a memory utilization chart"""
if not breakdown:
fig, ax = plt.subplots(figsize=(8, 6))
ax.text(0.5, 0.5, 'No data to display', ha='center', va='center')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
return fig
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# Memory breakdown pie chart
labels = ['Model Memory', 'KV Cache', 'Overhead']
model_mem = breakdown['model_memory_gb']
kv_mem = breakdown['kv_cache_memory_gb']
overhead_mem = breakdown['total_memory_gb'] - model_mem - kv_mem
sizes = [model_mem, kv_mem, overhead_mem]
colors = ['#ff9999', '#66b3ff', '#99ff99']
ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
ax1.set_title('Memory Breakdown')
# Node utilization bar chart
nodes = breakdown['nodes_required']
total_memory = breakdown['total_memory_gb']
memory_per_node = breakdown['h100_memory_per_node_gb']
node_labels = [f'Node {i+1}' for i in range(nodes)]
utilization = []
for i in range(nodes):
if i < nodes - 1:
utilization.append(memory_per_node)
else:
remaining_memory = total_memory - (nodes - 1) * memory_per_node
utilization.append(remaining_memory)
utilization_pct = [u / memory_per_node * 100 for u in utilization]
bars = ax2.bar(node_labels, utilization_pct, color='skyblue', alpha=0.7)
ax2.axhline(y=100, color='red', linestyle='--', alpha=0.7, label='Max Capacity')
ax2.set_ylabel('Memory Utilization (%)')
ax2.set_title('H100 Node Memory Utilization')
ax2.set_ylim(0, 110)
ax2.legend()
# Add value labels on bars
for bar, pct in zip(bars, utilization_pct):
ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
f'{pct:.1f}%', ha='center', va='bottom')
plt.tight_layout()
return fig
def estimate_nodes_interface(
model_name: str,
input_tokens: int,
output_tokens: int,
batch_size: int,
use_case: str,
precision: str
):
"""Main interface function"""
    # Validate inputs (gr.Number passes None when a field is cleared)
    if not input_tokens or input_tokens <= 0 or not output_tokens or output_tokens <= 0:
        return "Please enter valid token counts (> 0)", "", None, "## ⚠️ <span style='color: #E74C3C;'>**Invalid Input: Token counts must be > 0**</span>"
    if not batch_size or batch_size <= 0:
        return "Please enter a valid batch size (> 0)", "", None, "## ⚠️ <span style='color: #E74C3C;'>**Invalid Input: Batch size must be > 0**</span>"
    input_tokens, output_tokens, batch_size = int(input_tokens), int(output_tokens), int(batch_size)
# Calculate node requirements
nodes_needed, explanation, breakdown = estimate_h100_nodes(
model_name, input_tokens, output_tokens, batch_size, use_case, precision
)
# Get CUDA recommendations
cuda_rec = get_cuda_recommendation(use_case)
# Create performance chart
fig = create_performance_chart(breakdown)
return explanation, cuda_rec, fig, f"## 🖥️ <span style='color: #4A90E2;'>**Estimated H100 Nodes Required: {nodes_needed}**</span>"
# Create Gradio interface
def create_interface():
with gr.Blocks(title="H100 Node Estimator", theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🚀 H100 Node & CUDA Version Estimator")
gr.Markdown("Get recommendations for H100 node count and CUDA version based on your model and workload requirements.")
gr.Markdown("**Comprehensive Model Support**: LLaMA, Nemotron, Qwen2/2.5, Qwen-VL, VILA, Qwen-Audio, and **NVIDIA PhysicsNeMo** series!")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("## Input Parameters")
model_dropdown = gr.Dropdown(
choices=list(MODEL_SPECS.keys()),
value="LLaMA-3-8B",
label="Model",
info="Select the model you want to run (includes LLMs, multimodal, and physics-ML models)"
)
input_tokens = gr.Number(
value=2048,
label="Input Tokens",
info="Number of input tokens per request"
)
output_tokens = gr.Number(
value=512,
label="Output Tokens",
info="Number of output tokens per request"
)
batch_size = gr.Number(
value=1,
label="Batch Size",
info="Number of concurrent requests"
)
use_case = gr.Dropdown(
choices=["inference", "training", "fine_tuning"],
value="inference",
label="Use Case",
info="What will you use the model for?"
)
precision = gr.Dropdown(
choices=["FP32", "FP16", "BF16", "INT8", "INT4"],
value="FP16",
label="Precision",
info="Model precision/quantization"
)
estimate_btn = gr.Button("💡 Estimate Requirements", variant="primary")
with gr.Column(scale=2):
gr.Markdown("## Results")
node_count = gr.Markdown("## 🖥️ <span style='color: #4A90E2;'>**Ready to estimate...**</span>")
with gr.Tab("📊 Detailed Analysis"):
detailed_output = gr.Markdown()
with gr.Tab("🔧 CUDA Recommendations"):
cuda_output = gr.Markdown()
with gr.Tab("📈 Memory Utilization"):
chart_output = gr.Plot()
# Connect the interface
estimate_btn.click(
fn=estimate_nodes_interface,
inputs=[model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision],
outputs=[detailed_output, cuda_output, chart_output, node_count]
)
# Add examples
gr.Markdown("## 💡 Example Scenarios")
examples = [
["LLaMA-3-8B", 2048, 512, 1, "inference", "FP16"],
["LLaMA-3-70B", 4096, 1024, 4, "inference", "FP16"],
["Qwen2.5-72B", 8192, 2048, 2, "fine_tuning", "BF16"],
["Nemotron-4-340B", 2048, 1024, 1, "inference", "INT8"],
["Qwen2-VL-7B", 1024, 256, 1, "inference", "FP16"],
["VILA-1.5-13B", 2048, 512, 2, "inference", "BF16"],
["Qwen2-Audio-7B", 1024, 256, 1, "inference", "FP16"],
["PhysicsNeMo-FNO-Large", 512, 128, 8, "training", "FP32"],
["PhysicsNeMo-GraphCast-Medium", 1024, 256, 4, "training", "FP16"],
]
gr.Examples(
examples=examples,
inputs=[model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision],
outputs=[detailed_output, cuda_output, chart_output, node_count],
fn=estimate_nodes_interface,
cache_examples=False
)
gr.Markdown("""
## ℹ️ Notes
- **Multimodal Models**: Vision-language and audio models may require additional memory for image/audio processing
- **PhysicsNeMo Models**: Physics-ML models (FNO, PINN, GraphCast, SFNO) typically require higher batch sizes for training
- **Token Estimation**: For multimodal models, consider image patches (~256-1024 tokens per image) and audio frames
- **Physics Simulations**: PhysicsNeMo models often work with spatial/temporal grids rather than tokens
- Estimates are approximate and may vary based on actual implementation details
- Memory calculations include model weights, KV cache, and operational overhead
- Consider network bandwidth and storage requirements for multi-node setups
- For production deployments, add 10-20% buffer for optimal performance
""")
return demo
if __name__ == "__main__":
demo = create_interface()
demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
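# Run locally with `python app.py`; the UI is served on port 7860
# (share=True additionally creates a temporary public Gradio link).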