# Generated by ChatGPT — LLM VRAM & Time Estimator (Gradio)
# pip install torch transformers accelerate gradio pandas
import math, json
from dataclasses import dataclass, asdict
from typing import Dict, List, Tuple, Optional
import pandas as pd
import gradio as gr

PRECISION_BYTES = {"fp32":4.0, "fp16":2.0, "bf16":2.0, "int8":1.0}

@dataclass
class Inputs:
    hf_model_id: str; training_mode: str; precision: str; lora_rank: int; lora_alpha: int; lora_targets: str
    world_size: int; seq_len: int; micro_batch_size: int; grad_accum: int; optimizer: str; keep_master_fp32: bool
    activation_checkpointing: bool; activation_ckpt_factor: float; activation_multiplier: float
    zero_stage: str; fsdp: bool; fsdp_auto_wrap: bool; cpu_offload: bool
    reference_model: bool; reference_precision: str; rl_grpo: bool; rollout_batch_size: int; num_rollouts_per_step: int
    reward_model_params_b: float; verifier_overhead_gb: float; temp_allgather_overhead: float; safety_headroom: float
    tokens_to_train_b: float; throughput_toks_per_sec_per_gpu: float; cost_per_gpu_hour: float

@dataclass
class ComponentBreakdown:
    model_weights_gb: float=0.0; gradients_gb: float=0.0; optimizer_states_gb: float=0.0; activations_gb: float=0.0
    reference_model_gb: float=0.0; reward_model_gb: float=0.0; verifier_overhead_gb: float=0.0; temp_overheads_gb: float=0.0
    def total(self)->float: return sum(asdict(self).values())

def human_gb(x: float)->float: return round(x,3)

def count_model_params_meta(hf_model_id: str)->Tuple[int, Dict[str,int], Dict[str,Tuple[int,...]]]:
    import torch, torch.nn as nn
    from transformers import AutoConfig, AutoModelForCausalLM
    try:
        from accelerate import init_empty_weights
    except Exception:
        from contextlib import contextmanager
        @contextmanager
        def init_empty_weights(): yield
    cfg = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=True)
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(cfg, trust_remote_code=True)
    linear_param_counts, linear_shapes = {}, {}
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and getattr(module, "weight", None) is not None:
            shape = tuple(module.weight.shape); pcount=1
            for d in shape: pcount*=int(d)
            if getattr(module, "bias", None) is not None: pcount+=int(module.bias.numel())
            linear_param_counts[name]=pcount; linear_shapes[name]=shape
    t=0
    for p in model.parameters():
        if p is not None: t+=p.numel()
    return int(t), linear_param_counts, linear_shapes

def lora_params_for_targets(linear_shapes: Dict[str,Tuple[int,...]], targets: List[str], rank: int)->Dict[str,int]:
    per_layer={}
    for name, shape in linear_shapes.items():
        if not any(t in name for t in targets): continue
        if len(shape)!=2: continue
        out_features, in_features = int(shape[0]), int(shape[1])
        per_layer[name] = in_features*rank + rank*out_features
    return per_layer

def apply_parallelism_sharding(cb: ComponentBreakdown, world_size:int, zero_stage:str, fsdp:bool, temp_overhead_pct:float)->ComponentBreakdown:
    c = ComponentBreakdown(**asdict(cb))
    if zero_stage=="zero1":
        c.optimizer_states_gb/=max(1,world_size)
    elif zero_stage=="zero2":
        c.optimizer_states_gb/=max(1,world_size); c.gradients_gb/=max(1,world_size)
    elif zero_stage=="zero3":
        c.optimizer_states_gb/=max(1,world_size); c.gradients_gb/=max(1,world_size); c.model_weights_gb/=max(1,world_size)
    if fsdp:
        c.optimizer_states_gb/=max(1,world_size); c.gradients_gb/=max(1,world_size); c.model_weights_gb/=max(1,world_size)
    gathered=0.0
    if zero_stage=="zero3" or fsdp:
        gathered = temp_overhead_pct * (c.model_weights_gb + c.gradients_gb + c.optimizer_states_gb)
    c.temp_overheads_gb += gathered
    return c

def estimate_components(inp: Inputs, total_params:int, lora_param_map: Optional[Dict[str,int]]=None)->ComponentBreakdown:
    bytes_w = PRECISION_BYTES["fp32"] if inp.training_mode=="fp32" else PRECISION_BYTES[inp.precision]
    bytes_g = PRECISION_BYTES[inp.precision]
    if inp.optimizer=="adamw32": opt_bytes=8.0
    elif inp.optimizer=="adamw8": opt_bytes=2.0
    else: opt_bytes=4.0
    master_bytes = 4.0 if (inp.keep_master_fp32 and inp.precision in ["fp16","bf16"]) else 0.0
    cb = ComponentBreakdown()
    if inp.training_mode=="lora":
        base_weight_gb = (total_params * PRECISION_BYTES[inp.precision])/(1024**3)
        cb.model_weights_gb += base_weight_gb
        lora_total = sum(lora_param_map.values()) if lora_param_map else 0
        cb.gradients_gb += (lora_total*bytes_g)/(1024**3)
        cb.optimizer_states_gb += (lora_total*opt_bytes)/(1024**3)
        if master_bytes>0: cb.optimizer_states_gb += (lora_total*master_bytes)/(1024**3)
        cb.model_weights_gb += (lora_total*bytes_w)/(1024**3)
    else:
        cb.model_weights_gb += (total_params*bytes_w)/(1024**3)
        cb.gradients_gb += (total_params*bytes_g)/(1024**3)
        cb.optimizer_states_gb += (total_params*opt_bytes)/(1024**3)
        if master_bytes>0: cb.optimizer_states_gb += (total_params*master_bytes)/(1024**3)
    inferred_layers=32
    inferred_d=int(max(512, math.sqrt(max(1,total_params)/(12.0*inferred_layers))))
    tokens_per_micro = inp.micro_batch_size*inp.seq_len
    act_elems = tokens_per_micro*inferred_d*inferred_layers
    act_bytes = act_elems*PRECISION_BYTES[inp.precision]*inp.activation_multiplier
    if inp.activation_checkpointing: act_bytes *= inp.activation_ckpt_factor
    act_bytes *= min(1.0 + 0.1*max(0,inp.grad_accum-1), 2.0)
    cb.activations_gb = act_bytes/(1024**3)
    if inp.rl_grpo:
        if inp.reference_model:
            ref_bytes = PRECISION_BYTES[inp.reference_precision]
            cb.reference_model_gb += (total_params*ref_bytes)/(1024**3)
        if inp.reward_model_params_b>0:
            rm_bytes = PRECISION_BYTES[inp.reference_precision]
            rm_params = int(inp.reward_model_params_b*1e9)
            cb.reward_model_gb += (rm_params*rm_bytes)/(1024**3)
    return cb

def finalize_sharded_and_vram(cb: ComponentBreakdown, world_size:int, zero_stage:str, fsdp:bool, temp_overhead_pct:float, headroom_pct:float)->Dict[str,float]:
    sharded = apply_parallelism_sharding(cb, world_size, zero_stage, fsdp, temp_overhead_pct)
    total_gb = sharded.total()
    total_with_headroom = total_gb*(1.0+headroom_pct)
    per_gpu_gb = total_with_headroom/max(1,world_size)
    out = {"total_vram_gb": round(total_with_headroom,3), "per_gpu_vram_gb": round(per_gpu_gb,3), "world_size": world_size, "headroom_pct": headroom_pct}
    out.update({k: round(v,3) for k,v in asdict(sharded).items()})
    return out

def estimate_time_and_cost(inp: Inputs)->Dict[str,float]:
    global_batch = max(1,inp.micro_batch_size)*max(1,inp.grad_accum)*max(1,inp.world_size)
    toks_per_step = global_batch*max(1,inp.seq_len)
    total_tokens = max(1.0, inp.tokens_to_train_b)*1e9
    steps = total_tokens/max(1,toks_per_step)
    eff_tps = inp.throughput_toks_per_sec_per_gpu*max(1,inp.world_size)*0.8
    seconds = total_tokens/max(1,eff_tps); hours = seconds/3600.0
    gpu_hours = hours*max(1,inp.world_size); cost = gpu_hours*max(0.0,inp.cost_per_gpu_hour)
    return {"tokens_total": total_tokens, "toks_per_step": toks_per_step, "steps_estimate": math.ceil(steps),
            "time_hours": round(hours,2), "gpu_hours": round(gpu_hours,2), "cost_usd": round(cost,2)}

def run_estimate(hf_model_id, training_mode, precision, lora_rank, lora_alpha, lora_targets,
    world_size, seq_len, micro_batch_size, grad_accum, optimizer, keep_master_fp32,
    activation_checkpointing, activation_ckpt_factor, activation_multiplier,
    zero_stage, fsdp, fsdp_auto_wrap, cpu_offload, reference_model, reference_precision,
    rl_grpo, rollout_batch_size, num_rollouts_per_step, reward_model_params_b, verifier_overhead_gb,
    temp_allgather_overhead, safety_headroom, tokens_to_train_b, throughput_toks_per_sec_per_gpu, cost_per_gpu_hour,
    do_per_layer_lora):
    inp = Inputs(hf_model_id.strip(), training_mode, precision, int(lora_rank), int(lora_alpha), lora_targets.strip(),
                 int(world_size), int(seq_len), int(micro_batch_size), int(grad_accum), optimizer, bool(keep_master_fp32),
                 bool(activation_checkpointing), float(activation_ckpt_factor), float(activation_multiplier),
                 zero_stage, bool(fsdp), bool(fsdp_auto_wrap), bool(cpu_offload), bool(reference_model), reference_precision,
                 bool(rl_grpo), int(rollout_batch_size), int(num_rollouts_per_step), float(reward_model_params_b),
                 float(verifier_overhead_gb), float(temp_allgather_overhead), float(safety_headroom),
                 float(tokens_to_train_b), float(throughput_toks_per_sec_per_gpu), float(cost_per_gpu_hour))
    total_params=0; linear_shapes={}; linear_counts={}; errors=[]
    try:
        total_params, linear_counts, linear_shapes = count_model_params_meta(inp.hf_model_id) if inp.hf_model_id else (0,{},{})
    except Exception as e:
        errors.append(f"Model introspection failed: {e}")
    if total_params==0: total_params=int(7e9)
    lora_param_map=None
    import pandas as pd
    if inp.training_mode=="lora":
        targets=[t.strip() for t in inp.lora_targets.split(",") if t.strip()]
        try:
            lora_param_map = lora_params_for_targets(linear_shapes, targets, inp.lora_rank) if targets else {}
        except Exception as e:
            errors.append(f"LoRA per-layer mapping failed: {e}"); lora_param_map={}
        if do_per_layer_lora and lora_param_map:
            rows=[{"layer":n, "lora_params":p, "lora_params_millions":round(p/1e6,3),
                   "memory_fp16_MB": round((p*PRECISION_BYTES[inp.precision])/(1024**2),3)} for n,p in sorted(lora_param_map.items(), key=lambda x:-x[1])]
            df=pd.DataFrame(rows)
        else:
            df=pd.DataFrame([{"info":"Enable 'Show per-layer LoRA table' and provide target modules."}])
    else:
        df=pd.DataFrame([{"info":"Per-layer LoRA table only applies when Training Mode = LoRA."}])
    cb = estimate_components(inp, total_params, lora_param_map=lora_param_map)
    if inp.rl_grpo: cb.activations_gb *= 1.2
    vram = finalize_sharded_and_vram(cb, inp.world_size, inp.zero_stage, inp.fsdp, inp.temp_allgather_overhead, inp.safety_headroom)
    tc = estimate_time_and_cost(inp)
    b_rows=[{"component":k.replace("_gb",""), "GB": vram.get(k,0.0)} for k in ["model_weights_gb","gradients_gb","optimizer_states_gb","activations_gb","reference_model_gb","reward_model_gb","verifier_overhead_gb","temp_overheads_gb"]]
    breakdown_df=pd.DataFrame(b_rows)
    headline=f"Total VRAM (with headroom): {vram['total_vram_gb']} GB | Per-GPU: {vram['per_gpu_vram_gb']} GB | World Size: {vram['world_size']}"
    runtime=f"Estimated training time: {tc['time_hours']} hours ({tc['steps_estimate']} steps). GPU hours: {tc['gpu_hours']}h. Cost: ${tc['cost_usd']}."
    tips=("Tips: Increase ZeRO stage or enable FSDP to reduce optimizer/gradient/weight memory. "
          "Use activation checkpointing or reduce sequence length to curb activations. "
          "For LoRA, lowering rank or narrowing target modules cuts trainable params drastically. "
          "For RL (GRPO), consider smaller reference or reward models, or staggered rollout/eval.")
    return headline, breakdown_df, df, runtime, "\\n".join(errors) if errors else "OK", json.dumps(vram, indent=2), json.dumps(tc, indent=2), tips

def build_ui():
    with gr.Blocks(title="LLM VRAM & Time Estimator") as demo:
        gr.Markdown("# LLM VRAM & Time Estimator\\nPlan memory & runtime for full finetune, LoRA, or RL (GRPO).")
        with gr.Row():
            hf_model_id = gr.Textbox(label="Hugging Face Model ID (e.g., meta-llama/Llama-2-7b-hf)", value="meta-llama/Llama-2-7b-hf")
        with gr.Row():
            training_mode = gr.Radio(choices=["fp32", "fp16", "lora"], value="lora", label="Training Mode")
            precision = gr.Radio(choices=["fp32", "fp16"], value="fp16", label="Compute Precision (weights/grads)")
        with gr.Accordion("LoRA Settings", open=True):
            with gr.Row():
                lora_rank = gr.Slider(1, 256, value=16, step=1, label="LoRA Rank (r)")
                lora_alpha = gr.Slider(1, 256, value=16, step=1, label="LoRA Alpha (scaling)")
            lora_targets = gr.Textbox(label="Target module name filters (comma-separated)", value="q_proj,k_proj,v_proj,o_proj,up_proj,down_proj,gate_proj")
            do_per_layer_lora = gr.Checkbox(label="Show per-layer LoRA table", value=True)
        with gr.Accordion("Batch & Sequence", open=True):
            with gr.Row():
                seq_len = gr.Slider(128, 16384, value=4096, step=128, label="Max Sequence Length")
            with gr.Row():
                micro_batch_size = gr.Slider(1, 1024, value=4, step=1, label="Micro-batch Size (per GPU)")
                grad_accum = gr.Slider(1, 1024, value=8, step=1, label="Gradient Accumulation Steps")
        with gr.Accordion("Optimizer & Activations", open=False):
            with gr.Row():
                optimizer = gr.Dropdown(choices=[("AdamW (fp32 states)", "adamw32"), ("AdamW 8-bit (bnb)", "adamw8"), ("Adafactor", "adafactor")], value="adamw32", label="Optimizer")
                keep_master_fp32 = gr.Checkbox(label="Keep FP32 master weights (mixed precision)", value=True)
            with gr.Row():
                activation_checkpointing = gr.Checkbox(label="Activation Checkpointing", value=True)
                activation_ckpt_factor = gr.Slider(0.3, 1.0, value=0.5, step=0.05, label="Activation Checkpointing Reduction Factor")
                activation_multiplier = gr.Slider(0.5, 4.0, value=1.5, step=0.1, label="Activation Memory Multiplier (heuristic)")
        with gr.Accordion("Parallelism / Memory Strategies", open=True):
            with gr.Row():
                world_size = gr.Slider(1, 256, value=8, step=1, label="World Size (number of GPUs)")
                zero_stage = gr.Radio(choices=[("None", "none"), ("ZeRO-1", "zero1"), ("ZeRO-2", "zero2"), ("ZeRO-3", "zero3")], value="zero2", label="DeepSpeed ZeRO Stage")
            with gr.Row():
                fsdp = gr.Checkbox(label="Enable FSDP (Fully Sharded Data Parallel)", value=False)
                fsdp_auto_wrap = gr.Checkbox(label="FSDP Auto Wrap Policy", value=True)
                cpu_offload = gr.Checkbox(label="CPU Offload (optimizer/params)", value=False)
            temp_allgather_overhead = gr.Slider(0.0, 0.5, value=0.1, step=0.01, label="Temp AllGather Overhead (fraction)")
            safety_headroom = gr.Slider(0.0, 0.5, value=0.1, step=0.01, label="Safety Headroom (fraction)")
        with gr.Accordion("Reinforcement Learning (GRPO) Settings", open=False):
            rl_grpo = gr.Checkbox(label="Enable RL with verifiable rewards (GRPO-like)", value=False)
            with gr.Row():
                reference_model = gr.Checkbox(label="Load Reference Model (same as policy)", value=True)
                reference_precision = gr.Radio(choices=["fp16", "fp32"], value="fp16", label="Reference Model Precision")
            with gr.Row():
                rollout_batch_size = gr.Slider(1, 8192, value=256, step=1, label="Rollout Batch Size (per step)")
                num_rollouts_per_step = gr.Slider(1, 64, value=4, step=1, label="Number of Rollouts per Step")
            with gr.Row():
                reward_model_params_b = gr.Slider(0.0, 20.0, value=0.0, step=0.1, label="Reward Model Size (B params)")
                verifier_overhead_gb = gr.Slider(0.0, 40.0, value=0.0, step=0.5, label="Verifier / Extra Overheads (GB)")
        with gr.Accordion("Training Time & Cost", open=True):
            tokens_to_train_b = gr.Slider(0.1, 200.0, value=10.0, step=0.1, label="Total Tokens to Train (Billions)")
            throughput_toks_per_sec_per_gpu = gr.Slider(1000, 500000, value=20000, step=1000, label="Throughput (tokens/sec) per GPU")
            cost_per_gpu_hour = gr.Slider(0.0, 100.0, value=2.5, step=0.1, label="Cost per GPU-hour (USD)")
        estimate_btn = gr.Button("Estimate VRAM & Time", variant="primary")
        headline = gr.Markdown()
        with gr.Row():
            breakdown_table = gr.Dataframe(headers=["component", "GB"], label="VRAM Breakdown (after sharding + overhead)", interactive=False)
            lora_table = gr.Dataframe(headers=["layer", "lora_params", "lora_params_millions", "memory_fp16_MB"], label="Per-layer LoRA Params", interactive=False)
        runtime = gr.Markdown(); raw_vram = gr.JSON(label="Raw VRAM JSON"); raw_time = gr.JSON(label="Raw Time/Cost JSON"); status = gr.Markdown(); tips = gr.Markdown()
        estimate_btn.click(fn=run_estimate, inputs=[
            hf_model_id, training_mode, precision, lora_rank, lora_alpha, lora_targets, world_size, seq_len, micro_batch_size, grad_accum,
            optimizer, keep_master_fp32, activation_checkpointing, activation_ckpt_factor, activation_multiplier,
            zero_stage, fsdp, fsdp_auto_wrap, cpu_offload, reference_model, reference_precision, rl_grpo, rollout_batch_size, num_rollouts_per_step,
            reward_model_params_b, verifier_overhead_gb, temp_allgather_overhead, safety_headroom, tokens_to_train_b, throughput_toks_per_sec_per_gpu, cost_per_gpu_hour,
            do_per_layer_lora
        ], outputs=[headline, breakdown_table, lora_table, runtime, status, raw_vram, raw_time, tips])
    return demo

if __name__=="__main__":
    demo = build_ui()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)