import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from typing import Dict, Tuple

# Model specifications: approximate parameter counts and baseline weight memory in GB.
# base_memory_gb is roughly the FP16 footprint (~2 bytes per parameter) for the LLMs,
# with extra headroom folded in for the multimodal and physics-ML entries.
MODEL_SPECS = {
    "LLaMA-2-7B": {"params": 7e9, "base_memory_gb": 14},
    "LLaMA-2-13B": {"params": 13e9, "base_memory_gb": 26},
    "LLaMA-2-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3-8B": {"params": 8e9, "base_memory_gb": 16},
    "LLaMA-3-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3.1-8B": {"params": 8e9, "base_memory_gb": 16},
    "LLaMA-3.1-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3.1-405B": {"params": 405e9, "base_memory_gb": 810},
    "Nemotron-4-340B": {"params": 340e9, "base_memory_gb": 680},
    "Nemotron-4-15B": {"params": 15e9, "base_memory_gb": 30},
    "Qwen2-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
    "Qwen2-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
    "Qwen2-7B": {"params": 7e9, "base_memory_gb": 14},
    "Qwen2-72B": {"params": 72e9, "base_memory_gb": 144},
    "Qwen2.5-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
    "Qwen2.5-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
    "Qwen2.5-7B": {"params": 7e9, "base_memory_gb": 14},
    "Qwen2.5-14B": {"params": 14e9, "base_memory_gb": 28},
    "Qwen2.5-32B": {"params": 32e9, "base_memory_gb": 64},
    "Qwen2.5-72B": {"params": 72e9, "base_memory_gb": 144},
    # Qwen Vision Language Models
    "Qwen-VL": {"params": 9.6e9, "base_memory_gb": 20},
    "Qwen-VL-Chat": {"params": 9.6e9, "base_memory_gb": 20},
    "Qwen-VL-Plus": {"params": 12e9, "base_memory_gb": 25},
    "Qwen-VL-Max": {"params": 30e9, "base_memory_gb": 65},
    "Qwen2-VL-2B": {"params": 2e9, "base_memory_gb": 5},
    "Qwen2-VL-7B": {"params": 8e9, "base_memory_gb": 18},
    "Qwen2-VL-72B": {"params": 72e9, "base_memory_gb": 150},
    # NVIDIA VILA Series
    "VILA-1.5-3B": {"params": 3e9, "base_memory_gb": 7},
    "VILA-1.5-8B": {"params": 8e9, "base_memory_gb": 18},
    "VILA-1.5-13B": {"params": 13e9, "base_memory_gb": 28},
    "VILA-1.5-40B": {"params": 40e9, "base_memory_gb": 85},
    # Qwen Audio Models
    "Qwen-Audio": {"params": 8e9, "base_memory_gb": 18},
    "Qwen-Audio-Chat": {"params": 8e9, "base_memory_gb": 18},
    "Qwen2-Audio-7B": {"params": 8e9, "base_memory_gb": 18},
    # NVIDIA PhysicsNeMo Models
    "PhysicsNeMo-FNO-Small": {"params": 1e6, "base_memory_gb": 0.5},
    "PhysicsNeMo-FNO-Medium": {"params": 10e6, "base_memory_gb": 2},
    "PhysicsNeMo-FNO-Large": {"params": 50e6, "base_memory_gb": 8},
    "PhysicsNeMo-PINN-Small": {"params": 0.5e6, "base_memory_gb": 0.2},
    "PhysicsNeMo-PINN-Medium": {"params": 5e6, "base_memory_gb": 1},
    "PhysicsNeMo-PINN-Large": {"params": 20e6, "base_memory_gb": 4},
    "PhysicsNeMo-GraphCast-Small": {"params": 50e6, "base_memory_gb": 8},
    "PhysicsNeMo-GraphCast-Medium": {"params": 200e6, "base_memory_gb": 20},
    "PhysicsNeMo-GraphCast-Large": {"params": 1e9, "base_memory_gb": 50},
    "PhysicsNeMo-SFNO-Small": {"params": 25e6, "base_memory_gb": 5},
    "PhysicsNeMo-SFNO-Medium": {"params": 100e6, "base_memory_gb": 15},
    "PhysicsNeMo-SFNO-Large": {"params": 500e6, "base_memory_gb": 35},
}

# H100 specifications
H100_MEMORY_GB = 80  # Memory per GPU
H100_GPUS_PER_NODE = 8  # GPUs per node
H100_NODE_MEMORY_GB = H100_MEMORY_GB * H100_GPUS_PER_NODE  # 640 GB per node
H100_COMPUTE_CAPABILITY = "9.0"
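# These figures correspond to an 8-GPU HGX/DGX-style H100 node (8 × 80 GB = 640 GB of GPU memory);
# the estimator below reserves 10% of each node's memory for system overhead.
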
# CUDA version recommendations based on model and use case
CUDA_RECOMMENDATIONS = {
    "inference": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    },
    "training": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    },
    "fine_tuning": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    }
}

def calculate_kv_cache_memory(num_tokens: int, model_params: float, num_layers: int = None) -> float:
    """Calculate KV cache memory requirements in GB"""
    if num_layers is None:
        # Estimate the layer count from the model size
        if model_params < 1e9:
            num_layers = 24
        elif model_params < 10e9:
            num_layers = 32
        elif model_params < 100e9:
            num_layers = 80
        else:
            num_layers = 96
    # Rough transformer sizing: params ≈ 12 * num_layers * hidden_dim^2,
    # so hidden_dim ≈ sqrt(params / (12 * num_layers)), rounded to a multiple of 64
    hidden_dim = max(64, round((model_params / (num_layers * 12)) ** 0.5 / 64) * 64)
    # KV cache per token: 2 (K and V) * 2 bytes (FP16) * hidden_dim * num_layers
    kv_memory_per_token = 2 * 2 * hidden_dim * num_layers / (1024**3)  # GB
    return num_tokens * kv_memory_per_token
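
# Worked example for calculate_kv_cache_memory (rough numbers, assuming the layer/hidden
# heuristics above): an ~8B-parameter model gets num_layers = 32 and hidden_dim ≈ 4544,
# i.e. 2 * 2 * 4544 * 32 bytes ≈ 0.55 MB of KV cache per token, so 2,560 tokens is ~1.4 GB.
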
def estimate_h100_nodes(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    batch_size: int,
    use_case: str,
    precision: str
) -> Tuple[int, str, Dict]:
    """
    Estimate the number of H100 nodes required.

    Returns:
        - Number of nodes required
        - Detailed explanation
        - Dictionary with breakdown
    """
    if model_name not in MODEL_SPECS:
        return 1, f"Model {model_name} not found in specifications", {}

    model_spec = MODEL_SPECS[model_name]
    base_memory = model_spec["base_memory_gb"]

    # Adjust memory based on precision. base_memory_gb is an FP16 baseline
    # (~2 bytes per parameter), so the multipliers are relative to FP16.
    precision_multiplier = {
        "FP32": 2.0,
        "FP16": 1.0,
        "BF16": 1.0,
        "INT8": 0.5,
        "INT4": 0.25
    }
    model_memory = base_memory * precision_multiplier.get(precision, 1.0)

    # Calculate KV cache memory
    total_tokens = input_tokens + output_tokens
    kv_cache_memory = calculate_kv_cache_memory(total_tokens, model_spec["params"]) * batch_size

    # Use-case-specific memory overhead
    overhead_multiplier = {
        "inference": 1.2,    # 20% overhead for activations and runtime buffers
        "training": 3.0,     # ~3x for gradients and optimizer states
        "fine_tuning": 2.5   # ~2.5x for fine-tuning
    }
    total_memory_per_instance = (model_memory + kv_cache_memory) * overhead_multiplier.get(use_case, 1.2)

    # Calculate nodes needed; reserve 10% of each node's memory for the system
    memory_per_node = H100_NODE_MEMORY_GB * 0.9  # 576 GB usable per node
    nodes_needed = max(1, int(np.ceil(total_memory_per_instance / memory_per_node)))

    # For very large models, make sure the weights alone fit via model parallelism
    if model_memory > memory_per_node:
        min_nodes_for_model = int(np.ceil(model_memory / memory_per_node))
        nodes_needed = max(nodes_needed, min_nodes_for_model)

    # Generate explanation
    explanation = f"""
**Estimation Breakdown:**

• **Model**: {model_name} ({model_spec['params']/1e9:.1f}B parameters)
• **Precision**: {precision}
• **Model Memory**: {model_memory:.1f} GB
• **KV Cache Memory**: {kv_cache_memory:.1f} GB (for {total_tokens:,} tokens × {batch_size} batch size)
• **Use Case Overhead**: {overhead_multiplier.get(use_case, 1.2):.1f}x ({use_case})
• **Total Memory Required**: {total_memory_per_instance:.1f} GB
• **H100 Node Specs**: {H100_GPUS_PER_NODE} × {H100_MEMORY_GB}GB = {H100_NODE_MEMORY_GB}GB per node
• **Usable Memory**: {memory_per_node:.1f} GB per node (10% reserved)

**Recommendation**: {nodes_needed} H100 node(s) ({nodes_needed * H100_GPUS_PER_NODE} H100 GPUs total)
"""

    breakdown = {
        "model_memory_gb": model_memory,
        "kv_cache_memory_gb": kv_cache_memory,
        "total_memory_gb": total_memory_per_instance,
        "h100_memory_per_node_gb": memory_per_node,
        "nodes_required": nodes_needed
    }

    return nodes_needed, explanation, breakdown
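
# Worked example for estimate_h100_nodes (rough numbers, using the heuristics above):
# LLaMA-3-8B, FP16 inference, 2,048 input + 512 output tokens, batch size 1 gives
# ~16 GB of weights + ~1.4 GB of KV cache, times the 1.2x inference overhead ≈ 21 GB,
# which fits comfortably within a single node's ~576 GB of usable memory.
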
def get_cuda_recommendation(use_case: str) -> str:
    """Get CUDA version recommendation based on use case"""
    cuda_info = CUDA_RECOMMENDATIONS.get(use_case, CUDA_RECOMMENDATIONS["inference"])

    recommendation = f"""
**CUDA Version Recommendations for {use_case.title()}:**

• **Optimal**: CUDA {cuda_info['optimal']} + cuDNN 8.9+
• **Recommended**: CUDA {cuda_info['recommended']} + cuDNN 8.7+
• **Minimum**: CUDA {cuda_info['minimum']} + cuDNN 8.5+

**Additional Requirements:**

• **Driver Version**: 525.60.13+ (Linux) / 527.41+ (Windows)
• **Compute Capability**: {H100_COMPUTE_CAPABILITY} (H100 native)
• **Node Configuration**: {H100_GPUS_PER_NODE} × H100 GPUs per node ({H100_NODE_MEMORY_GB}GB total)
• **Memory**: ECC enabled recommended for production
"""
    return recommendation
def create_performance_chart(breakdown: Dict) -> plt.Figure:
    """Create a memory utilization chart"""
    if not breakdown:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.5, 'No data to display', ha='center', va='center')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        return fig

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Memory breakdown pie chart
    labels = ['Model Memory', 'KV Cache', 'Overhead']
    model_mem = breakdown['model_memory_gb']
    kv_mem = breakdown['kv_cache_memory_gb']
    overhead_mem = breakdown['total_memory_gb'] - model_mem - kv_mem
    sizes = [model_mem, kv_mem, overhead_mem]
    colors = ['#ff9999', '#66b3ff', '#99ff99']
    ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    ax1.set_title('Memory Breakdown')

    # Node utilization bar chart: all nodes except the last are assumed fully packed
    nodes = breakdown['nodes_required']
    total_memory = breakdown['total_memory_gb']
    memory_per_node = breakdown['h100_memory_per_node_gb']

    node_labels = [f'Node {i+1}' for i in range(nodes)]
    utilization = []
    for i in range(nodes):
        if i < nodes - 1:
            utilization.append(memory_per_node)
        else:
            remaining_memory = total_memory - (nodes - 1) * memory_per_node
            utilization.append(remaining_memory)

    utilization_pct = [u / memory_per_node * 100 for u in utilization]
    bars = ax2.bar(node_labels, utilization_pct, color='skyblue', alpha=0.7)
    ax2.axhline(y=100, color='red', linestyle='--', alpha=0.7, label='Max Capacity')
    ax2.set_ylabel('Memory Utilization (%)')
    ax2.set_title('H100 Node Memory Utilization')
    ax2.set_ylim(0, 110)
    ax2.legend()

    # Add value labels on bars
    for bar, pct in zip(bars, utilization_pct):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                 f'{pct:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    return fig
def estimate_nodes_interface(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    batch_size: int,
    use_case: str,
    precision: str
):
    """Main interface function"""
    # Validate inputs (gr.Number delivers floats and may be None if left blank)
    if not input_tokens or input_tokens <= 0 or not output_tokens or output_tokens <= 0:
        return "Please enter valid token counts (> 0)", "", None, "## ⚠️ <span style='color: #E74C3C;'>**Invalid Input: Token counts must be > 0**</span>"
    if not batch_size or batch_size <= 0:
        return "Please enter a valid batch size (> 0)", "", None, "## ⚠️ <span style='color: #E74C3C;'>**Invalid Input: Batch size must be > 0**</span>"
    input_tokens, output_tokens, batch_size = int(input_tokens), int(output_tokens), int(batch_size)

    # Calculate node requirements
    nodes_needed, explanation, breakdown = estimate_h100_nodes(
        model_name, input_tokens, output_tokens, batch_size, use_case, precision
    )

    # Get CUDA recommendations
    cuda_rec = get_cuda_recommendation(use_case)

    # Create performance chart
    fig = create_performance_chart(breakdown)

    return explanation, cuda_rec, fig, f"## 🖥️ <span style='color: #4A90E2;'>**Estimated H100 Nodes Required: {nodes_needed}**</span>"
# Create Gradio interface
def create_interface():
    with gr.Blocks(title="H100 Node Estimator", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🚀 H100 Node & CUDA Version Estimator")
        gr.Markdown("Get recommendations for H100 node count and CUDA version based on your model and workload requirements.")
        gr.Markdown("**Comprehensive Model Support**: LLaMA, Nemotron, Qwen2/2.5, Qwen-VL, VILA, Qwen-Audio, and **NVIDIA PhysicsNeMo** series!")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## Input Parameters")

                model_dropdown = gr.Dropdown(
                    choices=list(MODEL_SPECS.keys()),
                    value="LLaMA-3-8B",
                    label="Model",
                    info="Select the model you want to run (includes LLMs, multimodal, and physics-ML models)"
                )
                input_tokens = gr.Number(
                    value=2048,
                    label="Input Tokens",
                    info="Number of input tokens per request"
                )
                output_tokens = gr.Number(
                    value=512,
                    label="Output Tokens",
                    info="Number of output tokens per request"
                )
                batch_size = gr.Number(
                    value=1,
                    label="Batch Size",
                    info="Number of concurrent requests"
                )
                use_case = gr.Dropdown(
                    choices=["inference", "training", "fine_tuning"],
                    value="inference",
                    label="Use Case",
                    info="What will you use the model for?"
                )
                precision = gr.Dropdown(
                    choices=["FP32", "FP16", "BF16", "INT8", "INT4"],
                    value="FP16",
                    label="Precision",
                    info="Model precision/quantization"
                )

                estimate_btn = gr.Button("💡 Estimate Requirements", variant="primary")
            with gr.Column(scale=2):
                gr.Markdown("## Results")
                node_count = gr.Markdown("## 🖥️ <span style='color: #4A90E2;'>**Ready to estimate...**</span>")

                with gr.Tab("📊 Detailed Analysis"):
                    detailed_output = gr.Markdown()
                with gr.Tab("🔧 CUDA Recommendations"):
                    cuda_output = gr.Markdown()
                with gr.Tab("📈 Memory Utilization"):
                    chart_output = gr.Plot()
        # Connect the interface
        estimate_btn.click(
            fn=estimate_nodes_interface,
            inputs=[model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision],
            outputs=[detailed_output, cuda_output, chart_output, node_count]
        )
        # Add examples
        gr.Markdown("## 💡 Example Scenarios")
        examples = [
            ["LLaMA-3-8B", 2048, 512, 1, "inference", "FP16"],
            ["LLaMA-3-70B", 4096, 1024, 4, "inference", "FP16"],
            ["Qwen2.5-72B", 8192, 2048, 2, "fine_tuning", "BF16"],
            ["Nemotron-4-340B", 2048, 1024, 1, "inference", "INT8"],
            ["Qwen2-VL-7B", 1024, 256, 1, "inference", "FP16"],
            ["VILA-1.5-13B", 2048, 512, 2, "inference", "BF16"],
            ["Qwen2-Audio-7B", 1024, 256, 1, "inference", "FP16"],
            ["PhysicsNeMo-FNO-Large", 512, 128, 8, "training", "FP32"],
            ["PhysicsNeMo-GraphCast-Medium", 1024, 256, 4, "training", "FP16"],
        ]
        gr.Examples(
            examples=examples,
            inputs=[model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision],
            outputs=[detailed_output, cuda_output, chart_output, node_count],
            fn=estimate_nodes_interface,
            cache_examples=False
        )
        gr.Markdown("""
## ℹ️ Notes
- **Multimodal Models**: Vision-language and audio models may require additional memory for image/audio processing
- **PhysicsNeMo Models**: Physics-ML models (FNO, PINN, GraphCast, SFNO) typically require higher batch sizes for training
- **Token Estimation**: For multimodal models, account for image patches (~256-1024 tokens per image) and audio frames
- **Physics Simulations**: PhysicsNeMo models often work with spatial/temporal grids rather than tokens
- Estimates are approximate and may vary based on actual implementation details
- Memory calculations include model weights, KV cache, and operational overhead
- Consider network bandwidth and storage requirements for multi-node setups
- For production deployments, add a 10-20% buffer for optimal performance
""")

    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True, server_name="0.0.0.0", server_port=7860)