import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
from typing import Dict, Tuple, List

# Model specifications (approximate parameter counts and memory requirements)
MODEL_SPECS = {
    "LLaMA-2-7B": {"params": 7e9, "base_memory_gb": 14},
    "LLaMA-2-13B": {"params": 13e9, "base_memory_gb": 26},
    "LLaMA-2-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3-8B": {"params": 8e9, "base_memory_gb": 16},
    "LLaMA-3-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3.1-8B": {"params": 8e9, "base_memory_gb": 16},
    "LLaMA-3.1-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3.1-405B": {"params": 405e9, "base_memory_gb": 810},
    "Nemotron-4-340B": {"params": 340e9, "base_memory_gb": 680},
    "Nemotron-4-15B": {"params": 15e9, "base_memory_gb": 30},
    "Qwen2-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
    "Qwen2-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
    "Qwen2-7B": {"params": 7e9, "base_memory_gb": 14},
    "Qwen2-72B": {"params": 72e9, "base_memory_gb": 144},
    "Qwen2.5-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
    "Qwen2.5-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
    "Qwen2.5-7B": {"params": 7e9, "base_memory_gb": 14},
    "Qwen2.5-14B": {"params": 14e9, "base_memory_gb": 28},
    "Qwen2.5-32B": {"params": 32e9, "base_memory_gb": 64},
    "Qwen2.5-72B": {"params": 72e9, "base_memory_gb": 144},
    # Qwen Vision Language Models
    "Qwen-VL": {"params": 9.6e9, "base_memory_gb": 20},
    "Qwen-VL-Chat": {"params": 9.6e9, "base_memory_gb": 20},
    "Qwen-VL-Plus": {"params": 12e9, "base_memory_gb": 25},
    "Qwen-VL-Max": {"params": 30e9, "base_memory_gb": 65},
    "Qwen2-VL-2B": {"params": 2e9, "base_memory_gb": 5},
    "Qwen2-VL-7B": {"params": 8e9, "base_memory_gb": 18},
    "Qwen2-VL-72B": {"params": 72e9, "base_memory_gb": 150},
    # NVIDIA VILA Series
    "VILA-1.5-3B": {"params": 3e9, "base_memory_gb": 7},
    "VILA-1.5-8B": {"params": 8e9, "base_memory_gb": 18},
    "VILA-1.5-13B": {"params": 13e9, "base_memory_gb": 28},
    "VILA-1.5-40B": {"params": 40e9, "base_memory_gb": 85},
    # Qwen Audio Models
    "Qwen-Audio": {"params": 8e9, "base_memory_gb": 18},
    "Qwen-Audio-Chat": {"params": 8e9, "base_memory_gb": 18},
    "Qwen2-Audio-7B": {"params": 8e9, "base_memory_gb": 18},
    # NVIDIA PhysicsNeMo Models
    "PhysicsNeMo-FNO-Small": {"params": 1e6, "base_memory_gb": 0.5},
    "PhysicsNeMo-FNO-Medium": {"params": 10e6, "base_memory_gb": 2},
    "PhysicsNeMo-FNO-Large": {"params": 50e6, "base_memory_gb": 8},
    "PhysicsNeMo-PINN-Small": {"params": 0.5e6, "base_memory_gb": 0.2},
    "PhysicsNeMo-PINN-Medium": {"params": 5e6, "base_memory_gb": 1},
    "PhysicsNeMo-PINN-Large": {"params": 20e6, "base_memory_gb": 4},
    "PhysicsNeMo-GraphCast-Small": {"params": 50e6, "base_memory_gb": 8},
    "PhysicsNeMo-GraphCast-Medium": {"params": 200e6, "base_memory_gb": 20},
    "PhysicsNeMo-GraphCast-Large": {"params": 1e9, "base_memory_gb": 50},
    "PhysicsNeMo-SFNO-Small": {"params": 25e6, "base_memory_gb": 5},
    "PhysicsNeMo-SFNO-Medium": {"params": 100e6, "base_memory_gb": 15},
    "PhysicsNeMo-SFNO-Large": {"params": 500e6, "base_memory_gb": 35},
}

# H100 specifications
H100_MEMORY_GB = 80  # Memory per GPU
H100_GPUS_PER_NODE = 8  # GPUs per node
H100_NODE_MEMORY_GB = H100_MEMORY_GB * H100_GPUS_PER_NODE  # 640GB per node
H100_COMPUTE_CAPABILITY = "9.0"

# CUDA version recommendations based on model and use case
CUDA_RECOMMENDATIONS = {
    "inference": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    },
    "training": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    },
    "fine_tuning": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    }
}
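# Rough sizing arithmetic behind the tables above (illustrative only): for the LLM
# entries, "base_memory_gb" is scaled at roughly 2 bytes per parameter (an FP16/BF16
# weight footprint), e.g. an 8B-parameter model needs about 8e9 * 2 / 1e9 ≈ 16 GB of
# weights; the physics-ML entries also fold in framework and activation overhead.
# KV-cache memory is estimated per token below from the hidden size and layer count
# (roughly 0.5 MB/token for a 7B-class model at FP16) and multiplied by batch size.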
def calculate_kv_cache_memory(num_tokens: int, model_params: float, num_layers: int = None) -> float:
    """Calculate KV cache memory requirements in GB"""
    if num_layers is None:
        # Estimate layers based on model size
        if model_params < 1e9:
            num_layers = 24
        elif model_params < 10e9:
            num_layers = 32
        elif model_params < 100e9:
            num_layers = 80
        else:
            num_layers = 96

    # KV cache memory per token (approximate):
    # 2 (K and V) * 2 bytes (fp16) * hidden_dim * num_layers
    # Rough hidden-size estimate from params ≈ 12 * num_layers * hidden_dim^2,
    # rounded down to a multiple of 64 (minimum 64)
    hidden_dim = max(64, int((model_params / (num_layers * 12)) ** 0.5) // 64 * 64)
    kv_memory_per_token = 2 * 2 * hidden_dim * num_layers / (1024**3)  # GB

    return num_tokens * kv_memory_per_token


def estimate_h100_nodes(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    batch_size: int,
    use_case: str,
    precision: str
) -> Tuple[int, str, Dict]:
    """
    Estimate the number of H100 nodes required

    Returns:
    - Number of nodes required
    - Detailed explanation
    - Dictionary with breakdown
    """
    if model_name not in MODEL_SPECS:
        return 1, f"Model {model_name} not found in specifications", {}

    model_spec = MODEL_SPECS[model_name]
    base_memory = model_spec["base_memory_gb"]

    # Adjust memory based on precision.
    # base_memory_gb is quoted at roughly an FP16/BF16 footprint (~2 bytes/parameter
    # for the LLM entries), so the multipliers are relative to that baseline.
    precision_multiplier = {
        "FP32": 2.0,
        "FP16": 1.0,
        "BF16": 1.0,
        "INT8": 0.5,
        "INT4": 0.25
    }
    model_memory = base_memory * precision_multiplier.get(precision, 1.0)

    # Calculate KV cache memory
    total_tokens = input_tokens + output_tokens
    kv_cache_memory = calculate_kv_cache_memory(total_tokens, model_spec["params"]) * batch_size

    # Use case specific memory overhead
    overhead_multiplier = {
        "inference": 1.2,    # 20% overhead
        "training": 3.0,     # 3x for gradients, optimizer states
        "fine_tuning": 2.5   # 2.5x for fine-tuning
    }

    total_memory_per_instance = (model_memory + kv_cache_memory) * overhead_multiplier.get(use_case, 1.2)

    # Calculate nodes needed
    memory_per_node = H100_NODE_MEMORY_GB * 0.9  # Reserve 10% for system (576GB usable per node)
    nodes_needed = max(1, int(np.ceil(total_memory_per_instance / memory_per_node)))

    # For very large models, consider model parallelism
    if model_memory > memory_per_node:
        min_nodes_for_model = int(np.ceil(model_memory / memory_per_node))
        nodes_needed = max(nodes_needed, min_nodes_for_model)

    # Generate explanation
    explanation = f"""
**Estimation Breakdown:**

• **Model**: {model_name} ({model_spec['params']/1e9:.1f}B parameters)
• **Precision**: {precision}
• **Model Memory**: {model_memory:.1f} GB
• **KV Cache Memory**: {kv_cache_memory:.1f} GB (for {total_tokens:,} tokens × {batch_size} batch size)
• **Use Case Overhead**: {overhead_multiplier.get(use_case, 1.2):.1f}x ({use_case})
• **Total Memory Required**: {total_memory_per_instance:.1f} GB

• **H100 Node Specs**: {H100_GPUS_PER_NODE} × {H100_MEMORY_GB}GB = {H100_NODE_MEMORY_GB}GB per node
• **Usable Memory**: {memory_per_node:.1f} GB per node (10% reserved)

**Recommendation**: {nodes_needed} H100 node(s) ({nodes_needed * H100_GPUS_PER_NODE} H100 GPUs total)
"""

    breakdown = {
        "model_memory_gb": model_memory,
        "kv_cache_memory_gb": kv_cache_memory,
        "total_memory_gb": total_memory_per_instance,
        "h100_memory_per_node_gb": memory_per_node,
        "nodes_required": nodes_needed
    }

    return nodes_needed, explanation, breakdown


def get_cuda_recommendation(use_case: str) -> str:
    """Get CUDA version recommendation based on use case"""
    cuda_info = CUDA_RECOMMENDATIONS.get(use_case, CUDA_RECOMMENDATIONS["inference"])

    recommendation = f"""
**CUDA Version Recommendations for {use_case.title()}:**

• **Optimal**: CUDA {cuda_info['optimal']} + cuDNN 8.9+
• **Recommended**: CUDA {cuda_info['recommended']} + cuDNN 8.7+
• **Minimum**: CUDA {cuda_info['minimum']} + cuDNN 8.5+

**Additional Requirements:**

• **Driver Version**: 525.60.13+ (Linux) / 527.41+ (Windows)
• **Compute Capability**: {H100_COMPUTE_CAPABILITY} (H100 native)
• **Node Configuration**: {H100_GPUS_PER_NODE} × H100 GPUs per node ({H100_NODE_MEMORY_GB}GB total)
• **Memory**: ECC enabled recommended for production
"""

    return recommendation


def create_performance_chart(breakdown: Dict) -> plt.Figure:
    """Create a memory utilization chart"""
    if not breakdown:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.5, 'No data to display', ha='center', va='center')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        return fig

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Memory breakdown pie chart
    labels = ['Model Memory', 'KV Cache', 'Overhead']
    model_mem = breakdown['model_memory_gb']
    kv_mem = breakdown['kv_cache_memory_gb']
    overhead_mem = breakdown['total_memory_gb'] - model_mem - kv_mem

    sizes = [model_mem, kv_mem, overhead_mem]
    colors = ['#ff9999', '#66b3ff', '#99ff99']

    ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    ax1.set_title('Memory Breakdown')

    # Node utilization bar chart
    nodes = breakdown['nodes_required']
    total_memory = breakdown['total_memory_gb']
    memory_per_node = breakdown['h100_memory_per_node_gb']

    node_labels = [f'Node {i+1}' for i in range(nodes)]
    utilization = []

    for i in range(nodes):
        if i < nodes - 1:
            utilization.append(memory_per_node)
        else:
            remaining_memory = total_memory - (nodes - 1) * memory_per_node
            utilization.append(remaining_memory)

    utilization_pct = [u / memory_per_node * 100 for u in utilization]

    bars = ax2.bar(node_labels, utilization_pct, color='skyblue', alpha=0.7)
    ax2.axhline(y=100, color='red', linestyle='--', alpha=0.7, label='Max Capacity')
    ax2.set_ylabel('Memory Utilization (%)')
    ax2.set_title('H100 Node Memory Utilization')
    ax2.set_ylim(0, 110)
    ax2.legend()

    # Add value labels on bars
    for bar, pct in zip(bars, utilization_pct):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                 f'{pct:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    return fig


def estimate_nodes_interface(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    batch_size: int,
    use_case: str,
    precision: str
):
    """Main interface function"""
    # gr.Number components return floats (or None); normalize before validating
    input_tokens = int(input_tokens or 0)
    output_tokens = int(output_tokens or 0)
    batch_size = int(batch_size or 0)

    # Validate inputs
    if input_tokens <= 0 or output_tokens <= 0:
        return "Please enter valid token counts (> 0)", "", None, "## ⚠️ **Invalid Input: Token counts must be > 0**"

    if batch_size <= 0:
        return "Please enter a valid batch size (> 0)", "", None, "## ⚠️ **Invalid Input: Batch size must be > 0**"

    # Calculate node requirements
    nodes_needed, explanation, breakdown = estimate_h100_nodes(
        model_name, input_tokens, output_tokens, batch_size, use_case, precision
    )

    # Get CUDA recommendations
    cuda_rec = get_cuda_recommendation(use_case)

    # Create performance chart
    fig = create_performance_chart(breakdown)

    return explanation, cuda_rec, fig, f"## 🖥️ **Estimated H100 Nodes Required: {nodes_needed}**"


# Create Gradio interface
def create_interface():
    with gr.Blocks(title="H100 Node Estimator", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🚀 H100 Node & CUDA Version Estimator")
        gr.Markdown("Get recommendations for H100 node count and CUDA version based on your model and workload requirements.")
        gr.Markdown("**Comprehensive Model Support**: LLaMA, Nemotron, Qwen2/2.5, Qwen-VL, VILA, Qwen-Audio, and **NVIDIA PhysicsNeMo** series!")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## Input Parameters")

                model_dropdown = gr.Dropdown(
                    choices=list(MODEL_SPECS.keys()),
value="LLaMA-3-8B", label="Model", info="Select the model you want to run (includes LLMs, multimodal, and physics-ML models)" ) input_tokens = gr.Number( value=2048, label="Input Tokens", info="Number of input tokens per request" ) output_tokens = gr.Number( value=512, label="Output Tokens", info="Number of output tokens per request" ) batch_size = gr.Number( value=1, label="Batch Size", info="Number of concurrent requests" ) use_case = gr.Dropdown( choices=["inference", "training", "fine_tuning"], value="inference", label="Use Case", info="What will you use the model for?" ) precision = gr.Dropdown( choices=["FP32", "FP16", "BF16", "INT8", "INT4"], value="FP16", label="Precision", info="Model precision/quantization" ) estimate_btn = gr.Button("💡 Estimate Requirements", variant="primary") with gr.Column(scale=2): gr.Markdown("## Results") node_count = gr.Markdown("## 🖥️ **Ready to estimate...**") with gr.Tab("📊 Detailed Analysis"): detailed_output = gr.Markdown() with gr.Tab("🔧 CUDA Recommendations"): cuda_output = gr.Markdown() with gr.Tab("📈 Memory Utilization"): chart_output = gr.Plot() # Connect the interface estimate_btn.click( fn=estimate_nodes_interface, inputs=[model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision], outputs=[detailed_output, cuda_output, chart_output, node_count] ) # Add examples gr.Markdown("## 💡 Example Scenarios") examples = [ ["LLaMA-3-8B", 2048, 512, 1, "inference", "FP16"], ["LLaMA-3-70B", 4096, 1024, 4, "inference", "FP16"], ["Qwen2.5-72B", 8192, 2048, 2, "fine_tuning", "BF16"], ["Nemotron-4-340B", 2048, 1024, 1, "inference", "INT8"], ["Qwen2-VL-7B", 1024, 256, 1, "inference", "FP16"], ["VILA-1.5-13B", 2048, 512, 2, "inference", "BF16"], ["Qwen2-Audio-7B", 1024, 256, 1, "inference", "FP16"], ["PhysicsNeMo-FNO-Large", 512, 128, 8, "training", "FP32"], ["PhysicsNeMo-GraphCast-Medium", 1024, 256, 4, "training", "FP16"], ] gr.Examples( examples=examples, inputs=[model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision], outputs=[detailed_output, cuda_output, chart_output, node_count], fn=estimate_nodes_interface, cache_examples=False ) gr.Markdown(""" ## ℹ️ Notes - **Multimodal Models**: Vision-language and audio models may require additional memory for image/audio processing - **PhysicsNeMo Models**: Physics-ML models (FNO, PINN, GraphCast, SFNO) typically require higher batch sizes for training - **Token Estimation**: For multimodal models, consider image patches (~256-1024 tokens per image) and audio frames - **Physics Simulations**: PhysicsNeMo models often work with spatial/temporal grids rather than tokens - Estimates are approximate and may vary based on actual implementation details - Memory calculations include model weights, KV cache, and operational overhead - Consider network bandwidth and storage requirements for multi-node setups - For production deployments, add 10-20% buffer for optimal performance """) return demo if __name__ == "__main__": demo = create_interface() demo.launch(share=True, server_name="0.0.0.0", server_port=7860)