In [1]:
!nvidia-smi

Thu Aug 14 12:29:50 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch

In [9]:
class PyTorchCUDA101:
  def __init__(self) -> None:
    if not torch.cuda.is_available():
      raise RuntimeError("CUDA not available")
    self.device = torch.device("cuda")

    print(f"Using GPU: {torch.cuda.get_device_name()}")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"PyTorch version: {torch.__version__}")

    print("\n" + "="*60)
    print("GPU MEMORY BASELINE: Understanding CUDA Overhead")
    print("="* 60)

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    baseline_mem = torch.cuda.memory_allocated() / 1024**2

    min_tensor = torch.ones((1,1), device="cuda")
    cuda_overhead = torch.cuda.memory_allocated() / 1024**2

    print(f"ðŸ“Š Memory before CUDA init: {baseline_mem:.1f} MB")
    print(f"ðŸ“Š Memory after CUDA init: {cuda_overhead:.1f} MB")
    print(f"ðŸŽ¯ CUDA kernel overhead: {cuda_overhead - baseline_mem:.1f} MB")

    print(f"\nðŸ’¡ Key Reality Check: CUDA kernels consume 1-2 GB regardless of your model size!")
    print(f"   This overhead is constant and unavoidable for any GPU computation.")
    print(f"   Additional memory used for buffers, intermediate results, and fragmentation")
    print(f"   makes precise memory calculations challenging - focus on relative improvements.")

In [10]:
p = PyTorchCUDA101()

Using GPU: Tesla T4
CUDA version: 12.4
PyTorch version: 2.6.0+cu124

GPU MEMORY BASELINE: Understanding CUDA Overhead
ðŸ“Š Memory before CUDA init: 0.0 MB
ðŸ“Š Memory after CUDA init: 0.0 MB
ðŸŽ¯ CUDA kernel overhead: 0.0 MB

ðŸ’¡ Key Reality Check: CUDA kernels consume 1-2 GB regardless of your model size!
   This overhead is constant and unavoidable for any GPU computation.
   Additional memory used for buffers, intermediate results, and fragmentation
   makes precise memory calculations challenging - focus on relative improvements.


In [11]:
!nvidia-smi

Thu Aug 14 12:38:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   44C    P0             25W /   70W |     120MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                