# Generated by ChatGPT — LLM VRAM & Time Estimator (Gradio)
# pip install torch transformers accelerate gradio pandas
import math
import json
from dataclasses import dataclass, asdict
from typing import Dict, List, Tuple, Optional

import pandas as pd
import gradio as gr

# Bytes per parameter/element for each supported numeric format.
PRECISION_BYTES = {"fp32": 4.0, "fp16": 2.0, "bf16": 2.0, "int8": 1.0}


@dataclass
class Inputs:
    # Model & training mode
    hf_model_id: str
    training_mode: str
    precision: str
    # LoRA settings (lora_alpha is a scaling factor; it does not affect memory)
    lora_rank: int
    lora_alpha: int
    lora_targets: str
    # Batch geometry & optimizer
    world_size: int
    seq_len: int
    micro_batch_size: int
    grad_accum: int
    optimizer: str
    keep_master_fp32: bool
    # Activation memory knobs
    activation_checkpointing: bool
    activation_ckpt_factor: float
    activation_multiplier: float
    # Parallelism / memory strategies (fsdp_auto_wrap and cpu_offload are
    # collected from the UI but not yet modeled in the estimate)
    zero_stage: str
    fsdp: bool
    fsdp_auto_wrap: bool
    cpu_offload: bool
    # RL (GRPO) settings
    reference_model: bool
    reference_precision: str
    rl_grpo: bool
    rollout_batch_size: int
    num_rollouts_per_step: int
    reward_model_params_b: float
    verifier_overhead_gb: float
    # Overheads & headroom
    temp_allgather_overhead: float
    safety_headroom: float
    # Time & cost
    tokens_to_train_b: float
    throughput_toks_per_sec_per_gpu: float
    cost_per_gpu_hour: float


@dataclass
class ComponentBreakdown:
    model_weights_gb: float = 0.0
    gradients_gb: float = 0.0
    optimizer_states_gb: float = 0.0
    activations_gb: float = 0.0
    reference_model_gb: float = 0.0
    reward_model_gb: float = 0.0
    verifier_overhead_gb: float = 0.0
    temp_overheads_gb: float = 0.0

    def total(self) -> float:
        return sum(asdict(self).values())


def human_gb(x: float) -> float:
    return round(x, 3)


def count_model_params_meta(hf_model_id: str) -> Tuple[int, Dict[str, int], Dict[str, Tuple[int, ...]]]:
    """Instantiate the model on empty (meta) weights and count total parameters,
    plus per-nn.Linear parameter counts and weight shapes for LoRA mapping."""
    import torch.nn as nn
    from transformers import AutoConfig, AutoModelForCausalLM
    try:
        from accelerate import init_empty_weights
    except Exception:
        # Fallback when accelerate is unavailable. Note this no-op context
        # manager means from_config will materialize real weights in RAM.
        from contextlib import contextmanager

        @contextmanager
        def init_empty_weights():
            yield

    cfg = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=True)
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(cfg, trust_remote_code=True)
    linear_param_counts, linear_shapes = {}, {}
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and getattr(module, "weight", None) is not None:
            shape = tuple(module.weight.shape)
            pcount = 1
            for d in shape:
                pcount *= int(d)
            if getattr(module, "bias", None) is not None:
                pcount += int(module.bias.numel())
            linear_param_counts[name] = pcount
            linear_shapes[name] = shape
    t = 0
    for p in model.parameters():
        if p is not None:
            t += p.numel()
    return int(t), linear_param_counts, linear_shapes


def lora_params_for_targets(linear_shapes: Dict[str, Tuple[int, ...]], targets: List[str], rank: int) -> Dict[str, int]:
    """For each targeted Linear of shape (out_features, in_features), a rank-r
    LoRA adapter adds in_features*r + r*out_features trainable parameters."""
    per_layer = {}
    for name, shape in linear_shapes.items():
        if not any(t in name for t in targets):
            continue
        if len(shape) != 2:
            continue
        out_features, in_features = int(shape[0]), int(shape[1])
        per_layer[name] = in_features * rank + rank * out_features
    return per_layer


def apply_parallelism_sharding(cb: ComponentBreakdown, world_size: int, zero_stage: str, fsdp: bool, temp_overhead_pct: float) -> ComponentBreakdown:
    """Divide components across GPUs per ZeRO stage / FSDP, and add a temporary
    all-gather overhead when weights are fully sharded. Note: enabling both
    ZeRO-3 and FSDP shards the same components twice; pick one."""
    c = ComponentBreakdown(**asdict(cb))
    if zero_stage == "zero1":
        c.optimizer_states_gb /= max(1, world_size)
    elif zero_stage == "zero2":
        c.optimizer_states_gb /= max(1, world_size)
        c.gradients_gb /= max(1, world_size)
    elif zero_stage == "zero3":
        c.optimizer_states_gb /= max(1, world_size)
        c.gradients_gb /= max(1, world_size)
        c.model_weights_gb /= max(1, world_size)
    if fsdp:
        c.optimizer_states_gb /= max(1, world_size)
        c.gradients_gb /= max(1, world_size)
        c.model_weights_gb /= max(1, world_size)
    gathered = 0.0
    if zero_stage == "zero3" or fsdp:
        gathered = temp_overhead_pct * (c.model_weights_gb + c.gradients_gb + c.optimizer_states_gb)
    c.temp_overheads_gb += gathered
    return c
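
# Illustrative sanity check (not used by the app): applying the formula in
# lora_params_for_targets to a single hypothetical 4096x4096 projection at
# rank r=16 gives 4096*16 + 16*4096 = 131,072 trainable LoRA parameters.
def _lora_param_example() -> int:
    in_features, out_features, rank = 4096, 4096, 16  # assumed example shapes
    return in_features * rank + rank * out_features   # 131072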
def estimate_components(inp: Inputs, total_params: int, lora_param_map: Optional[Dict[str, int]] = None) -> ComponentBreakdown:
    """Estimate per-component memory (in GB) before any sharding is applied."""
    bytes_w = PRECISION_BYTES["fp32"] if inp.training_mode == "fp32" else PRECISION_BYTES[inp.precision]
    bytes_g = PRECISION_BYTES[inp.precision]
    # Optimizer state bytes per trainable parameter: AdamW with fp32 states
    # keeps two fp32 moments (8 B), 8-bit AdamW two int8 moments (2 B), and
    # anything else (e.g., Adafactor) is approximated at 4 B.
    if inp.optimizer == "adamw32":
        opt_bytes = 8.0
    elif inp.optimizer == "adamw8":
        opt_bytes = 2.0
    else:
        opt_bytes = 4.0
    master_bytes = 4.0 if (inp.keep_master_fp32 and inp.precision in ["fp16", "bf16"]) else 0.0
    cb = ComponentBreakdown()
    if inp.training_mode == "lora":
        # Frozen base weights at compute precision; only LoRA adapters get
        # gradients, optimizer states, and (optionally) fp32 masters.
        cb.model_weights_gb += (total_params * PRECISION_BYTES[inp.precision]) / (1024**3)
        lora_total = sum(lora_param_map.values()) if lora_param_map else 0
        cb.gradients_gb += (lora_total * bytes_g) / (1024**3)
        cb.optimizer_states_gb += (lora_total * opt_bytes) / (1024**3)
        if master_bytes > 0:
            cb.optimizer_states_gb += (lora_total * master_bytes) / (1024**3)
        cb.model_weights_gb += (lora_total * bytes_w) / (1024**3)
    else:
        cb.model_weights_gb += (total_params * bytes_w) / (1024**3)
        cb.gradients_gb += (total_params * bytes_g) / (1024**3)
        cb.optimizer_states_gb += (total_params * opt_bytes) / (1024**3)
        if master_bytes > 0:
            cb.optimizer_states_gb += (total_params * master_bytes) / (1024**3)
    # Activation heuristic: infer a hidden size from total_params assuming
    # params ~= 12 * layers * d^2, then scale by tokens, layers, and multiplier.
    inferred_layers = 32
    inferred_d = int(max(512, math.sqrt(max(1, total_params) / (12.0 * inferred_layers))))
    tokens_per_micro = inp.micro_batch_size * inp.seq_len
    act_elems = tokens_per_micro * inferred_d * inferred_layers
    act_bytes = act_elems * PRECISION_BYTES[inp.precision] * inp.activation_multiplier
    if inp.activation_checkpointing:
        act_bytes *= inp.activation_ckpt_factor
    # Mild penalty for gradient-accumulation buffers, capped at 2x.
    act_bytes *= min(1.0 + 0.1 * max(0, inp.grad_accum - 1), 2.0)
    cb.activations_gb = act_bytes / (1024**3)
    if inp.rl_grpo:
        if inp.reference_model:
            ref_bytes = PRECISION_BYTES[inp.reference_precision]
            cb.reference_model_gb += (total_params * ref_bytes) / (1024**3)
        if inp.reward_model_params_b > 0:
            rm_bytes = PRECISION_BYTES[inp.reference_precision]
            rm_params = int(inp.reward_model_params_b * 1e9)
            cb.reward_model_gb += (rm_params * rm_bytes) / (1024**3)
        # Fixed user-supplied verifier/extra overhead (previously collected
        # from the UI but never added to the breakdown).
        cb.verifier_overhead_gb += inp.verifier_overhead_gb
    return cb


def finalize_sharded_and_vram(cb: ComponentBreakdown, world_size: int, zero_stage: str, fsdp: bool, temp_overhead_pct: float, headroom_pct: float) -> Dict[str, float]:
    sharded = apply_parallelism_sharding(cb, world_size, zero_stage, fsdp, temp_overhead_pct)
    total_gb = sharded.total()
    total_with_headroom = total_gb * (1.0 + headroom_pct)
    per_gpu_gb = total_with_headroom / max(1, world_size)
    out = {
        "total_vram_gb": round(total_with_headroom, 3),
        "per_gpu_vram_gb": round(per_gpu_gb, 3),
        "world_size": world_size,
        "headroom_pct": headroom_pct,
    }
    out.update({k: round(v, 3) for k, v in asdict(sharded).items()})
    return out


def estimate_time_and_cost(inp: Inputs) -> Dict[str, float]:
    global_batch = max(1, inp.micro_batch_size) * max(1, inp.grad_accum) * max(1, inp.world_size)
    toks_per_step = global_batch * max(1, inp.seq_len)
    total_tokens = max(1.0, inp.tokens_to_train_b) * 1e9
    steps = total_tokens / max(1, toks_per_step)
    # Assume 80% scaling efficiency across GPUs.
    eff_tps = inp.throughput_toks_per_sec_per_gpu * max(1, inp.world_size) * 0.8
    seconds = total_tokens / max(1, eff_tps)
    hours = seconds / 3600.0
    gpu_hours = hours * max(1, inp.world_size)
    cost = gpu_hours * max(0.0, inp.cost_per_gpu_hour)
    return {
        "tokens_total": total_tokens,
        "toks_per_step": toks_per_step,
        "steps_estimate": math.ceil(steps),
        "time_hours": round(hours, 2),
        "gpu_hours": round(gpu_hours, 2),
        "cost_usd": round(cost, 2),
    }
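
# Illustrative worked example of the formulas above, under assumed settings
# (close to the UI defaults, but a 7B full finetune instead of LoRA: fp16
# compute, AdamW fp32 states, 10B tokens, 8 GPUs at 20,000 tok/s, $2.50/GPU-h).
# Defined for reference only; not called by the app.
def _worked_example() -> Dict[str, float]:
    params = 7e9
    weights_gb = params * 2 / 1024**3   # ~13.04 GB fp16 weights
    grads_gb = params * 2 / 1024**3     # ~13.04 GB fp16 gradients
    opt_gb = params * 8 / 1024**3       # ~52.15 GB AdamW fp32 moments
    masters_gb = params * 4 / 1024**3   # ~26.08 GB fp32 master weights
    eff_tps = 20000 * 8 * 0.8           # 128,000 tok/s at 80% scaling
    hours = 1e10 / eff_tps / 3600.0     # ~21.7 h for 10B tokens
    gpu_hours = hours * 8               # ~173.6 GPU-hours
    cost = gpu_hours * 2.5              # ~$434
    return {"weights_gb": weights_gb, "grads_gb": grads_gb, "opt_gb": opt_gb,
            "masters_gb": masters_gb, "hours": hours, "gpu_hours": gpu_hours,
            "cost_usd": cost}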
def run_estimate(hf_model_id, training_mode, precision, lora_rank, lora_alpha, lora_targets,
                 world_size, seq_len, micro_batch_size, grad_accum, optimizer, keep_master_fp32,
                 activation_checkpointing, activation_ckpt_factor, activation_multiplier,
                 zero_stage, fsdp, fsdp_auto_wrap, cpu_offload,
                 reference_model, reference_precision, rl_grpo, rollout_batch_size, num_rollouts_per_step,
                 reward_model_params_b, verifier_overhead_gb, temp_allgather_overhead, safety_headroom,
                 tokens_to_train_b, throughput_toks_per_sec_per_gpu, cost_per_gpu_hour, do_per_layer_lora):
    inp = Inputs(hf_model_id.strip(), training_mode, precision, int(lora_rank), int(lora_alpha),
                 lora_targets.strip(), int(world_size), int(seq_len), int(micro_batch_size),
                 int(grad_accum), optimizer, bool(keep_master_fp32), bool(activation_checkpointing),
                 float(activation_ckpt_factor), float(activation_multiplier), zero_stage, bool(fsdp),
                 bool(fsdp_auto_wrap), bool(cpu_offload), bool(reference_model), reference_precision,
                 bool(rl_grpo), int(rollout_batch_size), int(num_rollouts_per_step),
                 float(reward_model_params_b), float(verifier_overhead_gb), float(temp_allgather_overhead),
                 float(safety_headroom), float(tokens_to_train_b), float(throughput_toks_per_sec_per_gpu),
                 float(cost_per_gpu_hour))
    total_params = 0
    linear_counts = {}
    linear_shapes = {}
    errors = []
    try:
        if inp.hf_model_id:
            total_params, linear_counts, linear_shapes = count_model_params_meta(inp.hf_model_id)
    except Exception as e:
        errors.append(f"Model introspection failed: {e}")
    if total_params == 0:
        total_params = int(7e9)  # fall back to a 7B-parameter assumption
    lora_param_map = None
    if inp.training_mode == "lora":
        targets = [t.strip() for t in inp.lora_targets.split(",") if t.strip()]
        try:
            lora_param_map = lora_params_for_targets(linear_shapes, targets, inp.lora_rank) if targets else {}
        except Exception as e:
            errors.append(f"LoRA per-layer mapping failed: {e}")
            lora_param_map = {}
        if do_per_layer_lora and lora_param_map:
            rows = [{"layer": n,
                     "lora_params": p,
                     "lora_params_millions": round(p / 1e6, 3),
                     "memory_MB": round((p * PRECISION_BYTES[inp.precision]) / (1024**2), 3)}
                    for n, p in sorted(lora_param_map.items(), key=lambda x: -x[1])]
            df = pd.DataFrame(rows)
        else:
            df = pd.DataFrame([{"info": "Enable 'Show per-layer LoRA table' and provide target modules."}])
    else:
        df = pd.DataFrame([{"info": "Per-layer LoRA table only applies when Training Mode = LoRA."}])
    cb = estimate_components(inp, total_params, lora_param_map=lora_param_map)
    if inp.rl_grpo:
        cb.activations_gb *= 1.2  # rough bump for rollout/eval activations
    vram = finalize_sharded_and_vram(cb, inp.world_size, inp.zero_stage, inp.fsdp,
                                     inp.temp_allgather_overhead, inp.safety_headroom)
    tc = estimate_time_and_cost(inp)
    b_rows = [{"component": k.replace("_gb", ""), "GB": vram.get(k, 0.0)}
              for k in ["model_weights_gb", "gradients_gb", "optimizer_states_gb", "activations_gb",
                        "reference_model_gb", "reward_model_gb", "verifier_overhead_gb", "temp_overheads_gb"]]
    breakdown_df = pd.DataFrame(b_rows)
    headline = (f"Total VRAM (with headroom): {vram['total_vram_gb']} GB | "
                f"Per-GPU: {vram['per_gpu_vram_gb']} GB | World Size: {vram['world_size']}")
    runtime = (f"Estimated training time: {tc['time_hours']} hours ({tc['steps_estimate']} steps). "
               f"GPU hours: {tc['gpu_hours']}h. Cost: ${tc['cost_usd']}.")
    tips = ("Tips: Increase the ZeRO stage or enable FSDP to reduce optimizer/gradient/weight memory. "
            "Use activation checkpointing or reduce sequence length to curb activations. "
            "For LoRA, lowering the rank or narrowing target modules cuts trainable params drastically. "
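
# Illustrative headless usage of the introspection helper (assumes network
# access and the transformers package; "gpt2" is just an example model id).
# Defined for reference only; not called by the app.
def _introspection_example() -> int:
    total, _counts, _shapes = count_model_params_meta("gpt2")
    return total  # roughly 124M parameters for gpt2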
" "For RL (GRPO), consider smaller reference or reward models, or staggered rollout/eval.") return headline, breakdown_df, df, runtime, "\\n".join(errors) if errors else "OK", json.dumps(vram, indent=2), json.dumps(tc, indent=2), tips def build_ui(): with gr.Blocks(title="LLM VRAM & Time Estimator") as demo: gr.Markdown("# LLM VRAM & Time Estimator\\nPlan memory & runtime for full finetune, LoRA, or RL (GRPO).") with gr.Row(): hf_model_id = gr.Textbox(label="Hugging Face Model ID (e.g., meta-llama/Llama-2-7b-hf)", value="meta-llama/Llama-2-7b-hf") with gr.Row(): training_mode = gr.Radio(choices=["fp32", "fp16", "lora"], value="lora", label="Training Mode") precision = gr.Radio(choices=["fp32", "fp16"], value="fp16", label="Compute Precision (weights/grads)") with gr.Accordion("LoRA Settings", open=True): with gr.Row(): lora_rank = gr.Slider(1, 256, value=16, step=1, label="LoRA Rank (r)") lora_alpha = gr.Slider(1, 256, value=16, step=1, label="LoRA Alpha (scaling)") lora_targets = gr.Textbox(label="Target module name filters (comma-separated)", value="q_proj,k_proj,v_proj,o_proj,up_proj,down_proj,gate_proj") do_per_layer_lora = gr.Checkbox(label="Show per-layer LoRA table", value=True) with gr.Accordion("Batch & Sequence", open=True): with gr.Row(): seq_len = gr.Slider(128, 16384, value=4096, step=128, label="Max Sequence Length") with gr.Row(): micro_batch_size = gr.Slider(1, 1024, value=4, step=1, label="Micro-batch Size (per GPU)") grad_accum = gr.Slider(1, 1024, value=8, step=1, label="Gradient Accumulation Steps") with gr.Accordion("Optimizer & Activations", open=False): with gr.Row(): optimizer = gr.Dropdown(choices=[("AdamW (fp32 states)", "adamw32"), ("AdamW 8-bit (bnb)", "adamw8"), ("Adafactor", "adafactor")], value="adamw32", label="Optimizer") keep_master_fp32 = gr.Checkbox(label="Keep FP32 master weights (mixed precision)", value=True) with gr.Row(): activation_checkpointing = gr.Checkbox(label="Activation Checkpointing", value=True) activation_ckpt_factor = gr.Slider(0.3, 1.0, value=0.5, step=0.05, label="Activation Checkpointing Reduction Factor") activation_multiplier = gr.Slider(0.5, 4.0, value=1.5, step=0.1, label="Activation Memory Multiplier (heuristic)") with gr.Accordion("Parallelism / Memory Strategies", open=True): with gr.Row(): world_size = gr.Slider(1, 256, value=8, step=1, label="World Size (number of GPUs)") zero_stage = gr.Radio(choices=[("None", "none"), ("ZeRO-1", "zero1"), ("ZeRO-2", "zero2"), ("ZeRO-3", "zero3")], value="zero2", label="DeepSpeed ZeRO Stage") with gr.Row(): fsdp = gr.Checkbox(label="Enable FSDP (Fully Sharded Data Parallel)", value=False) fsdp_auto_wrap = gr.Checkbox(label="FSDP Auto Wrap Policy", value=True) cpu_offload = gr.Checkbox(label="CPU Offload (optimizer/params)", value=False) temp_allgather_overhead = gr.Slider(0.0, 0.5, value=0.1, step=0.01, label="Temp AllGather Overhead (fraction)") safety_headroom = gr.Slider(0.0, 0.5, value=0.1, step=0.01, label="Safety Headroom (fraction)") with gr.Accordion("Reinforcement Learning (GRPO) Settings", open=False): rl_grpo = gr.Checkbox(label="Enable RL with verifiable rewards (GRPO-like)", value=False) with gr.Row(): reference_model = gr.Checkbox(label="Load Reference Model (same as policy)", value=True) reference_precision = gr.Radio(choices=["fp16", "fp32"], value="fp16", label="Reference Model Precision") with gr.Row(): rollout_batch_size = gr.Slider(1, 8192, value=256, step=1, label="Rollout Batch Size (per step)") num_rollouts_per_step = gr.Slider(1, 64, value=4, step=1, label="Number of 
Rollouts per Step") with gr.Row(): reward_model_params_b = gr.Slider(0.0, 20.0, value=0.0, step=0.1, label="Reward Model Size (B params)") verifier_overhead_gb = gr.Slider(0.0, 40.0, value=0.0, step=0.5, label="Verifier / Extra Overheads (GB)") with gr.Accordion("Training Time & Cost", open=True): tokens_to_train_b = gr.Slider(0.1, 200.0, value=10.0, step=0.1, label="Total Tokens to Train (Billions)") throughput_toks_per_sec_per_gpu = gr.Slider(1000, 500000, value=20000, step=1000, label="Throughput (tokens/sec) per GPU") cost_per_gpu_hour = gr.Slider(0.0, 100.0, value=2.5, step=0.1, label="Cost per GPU-hour (USD)") estimate_btn = gr.Button("Estimate VRAM & Time", variant="primary") headline = gr.Markdown() with gr.Row(): breakdown_table = gr.Dataframe(headers=["component", "GB"], label="VRAM Breakdown (after sharding + overhead)", interactive=False) lora_table = gr.Dataframe(headers=["layer", "lora_params", "lora_params_millions", "memory_fp16_MB"], label="Per-layer LoRA Params", interactive=False) runtime = gr.Markdown(); raw_vram = gr.JSON(label="Raw VRAM JSON"); raw_time = gr.JSON(label="Raw Time/Cost JSON"); status = gr.Markdown(); tips = gr.Markdown() estimate_btn.click(fn=run_estimate, inputs=[ hf_model_id, training_mode, precision, lora_rank, lora_alpha, lora_targets, world_size, seq_len, micro_batch_size, grad_accum, optimizer, keep_master_fp32, activation_checkpointing, activation_ckpt_factor, activation_multiplier, zero_stage, fsdp, fsdp_auto_wrap, cpu_offload, reference_model, reference_precision, rl_grpo, rollout_batch_size, num_rollouts_per_step, reward_model_params_b, verifier_overhead_gb, temp_allgather_overhead, safety_headroom, tokens_to_train_b, throughput_toks_per_sec_per_gpu, cost_per_gpu_hour, do_per_layer_lora ], outputs=[headline, breakdown_table, lora_table, runtime, status, raw_vram, raw_time, tips]) return demo if __name__=="__main__": demo = build_ui() demo.launch(server_name="0.0.0.0", server_port=7860, share=False)