{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"machine_shape":"hm","gpuType":"L4","authorship_tag":"ABX9TyPRgRq+CmOekg2XVhIIL5we"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"code","source":["# -*- coding: utf-8 -*-\n","\"\"\"\n","Tutorial: Efficient Deep Learning Systems - Intro & Benchmarking\n","\n","Based on the lecture by Max Ryabinin.\n","This notebook explores fundamental concepts of GPU execution and benchmarking\n","in PyTorch, relevant for building efficient DL systems.\n","\"\"\"\n","\n","# %% [markdown]\n","# # Efficient Deep Learning Systems: Introduction & Benchmarking in PyTorch\n","#\n","# This notebook provides practical examples related to the introductory concepts covered in the \"Efficient Deep Learning Systems\" course. We'll touch upon:\n","#\n","# 1. **GPU Architecture Basics (Briefly):** Understanding the high-level differences between CPU and GPU execution.\n","# 2. **CUDA Execution Model:** How PyTorch interacts with the GPU (asynchronous execution).\n","# 3. **Memory Access:** Host-to-Device (H2D) and Device-to-Host (D2H) transfers, and pinned memory.\n","# 4. **Benchmarking:** How to measure performance correctly, considering synchronization and warm-up.\n","# 5. **Input Shape Effects:** How performance can vary based on tensor dimensions (Tile/Wave Quantization effects).\n","# 6. **PyTorch Utilities:** Using `torch.utils.benchmark`.\n","#\n","# **Prerequisites:**\n","# - PyTorch installed (`pip install torch torchvision torchaudio`)\n","# - A CUDA-enabled GPU recognized by PyTorch\n","# - `numpy` and `matplotlib` (`pip install numpy matplotlib`)\n","\n","# %%\n","import torch\n","import numpy as np\n","import time\n","import matplotlib.pyplot as plt\n","import timeit\n","\n","# %% [markdown]\n","# ## 1. 
# Probe CUDA once; the same answer selects the device and gates every
# GPU-only demo further down via `use_gpu`.
use_gpu = torch.cuda.is_available()

if not use_gpu:
    device = torch.device("cpu")
    print("CUDA device not found, using CPU.")
    # Many examples below are GPU-specific, execution might fail or be slow.
else:
    device = torch.device("cuda")
    print(f"CUDA device detected: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display.
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Total GPU Memory: {total_bytes / (1024**3):.2f} GB")
if use_gpu:
    # Example: Matrix Multiplication
    size = 2048
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)

    # One untimed call first, so one-time start-up cost (kernel loading,
    # context initialization) does not distort the "no sync" number below.
    # Warm-up is covered in its own section of this tutorial.
    torch.matmul(a, b)
    torch.cuda.synchronize()

    # --- Incorrect Timing (No Synchronization) ---
    # matmul only enqueues the kernel and returns; the interval below ends
    # while the GPU is still working, so it mostly reflects launch overhead.
    # perf_counter() is used throughout: it is monotonic and has the highest
    # available resolution, which matters for sub-millisecond intervals.
    start_time = time.perf_counter()
    c = torch.matmul(a, b)
    end_time = time.perf_counter()
    print(f"Incorrect MM time (no sync): {(end_time - start_time) * 1000:.4f} ms")

    # --- Correct Timing (With Synchronization) ---
    # Sync *before* starting: the matmul launched above has not been waited
    # for yet and would otherwise be counted in this measurement.
    torch.cuda.synchronize()
    start_time = time.perf_counter()
    c = torch.matmul(a, b)
    torch.cuda.synchronize()  # Wait for matmul kernel to finish
    end_time = time.perf_counter()
    print(f"Correct MM time (with sync): {(end_time - start_time) * 1000:.4f} ms")

    # Using timeit (more robust for short durations).
    # timeit runs the statement `number` times in its own namespace, so the
    # tensors are created in `setup` and the synchronization has to be part
    # of the timed statement itself.
    stmt = "torch.matmul(a, b); torch.cuda.synchronize()"
    setup = "import torch; size=2048; device=torch.device('cuda'); torch.cuda.synchronize(); a = torch.randn(size, size, device=device); b = torch.randn(size, size, device=device); torch.cuda.synchronize()"
    num_runs = 10
    timer = timeit.Timer(stmt=stmt, setup=setup)
    avg_time_ms = timer.timeit(number=num_runs) / num_runs * 1000
    print(f"Correct MM time (timeit, avg over {num_runs} runs): {avg_time_ms:.4f} ms")

else:
    print("Skipping asynchronous execution demo (requires GPU).")

# Synchronization triggered by data transfer: .cpu() / .item() need the
# result on the host, so they block until the producing work has finished.
if use_gpu:
    size = 1024
    a_gpu = torch.randn(size, size, device=device)
    b_gpu = torch.randn(size, size, device=device)
    # Finish the randn kernels so they are not counted below.
    torch.cuda.synchronize()

    # Time the operation *plus* the copy back to CPU. The timer starts
    # before the matmul is launched so the interval really covers both,
    # as the printed label says.
    start_time = time.perf_counter()
    c_gpu = torch.matmul(a_gpu, b_gpu)
    c_cpu = c_gpu.cpu()  # This forces synchronization for c_gpu
    end_time = time.perf_counter()
    print(f"Time for matmul + .cpu() transfer: {(end_time - start_time) * 1000:.4f} ms")

    # Similarly, .item() on a 1-element tensor synchronizes. Again the
    # timer brackets both the reduction and the transfer of its result.
    start_time = time.perf_counter()
    result_gpu = torch.sum(c_gpu)
    result_val = result_gpu.item()  # This forces synchronization for result_gpu
    end_time = time.perf_counter()
    print(f"Time for sum + .item(): {(end_time - start_time) * 1000:.4f} ms")
    print(f"Sum result: {result_val}")  # The CPU now has the result

else:
    print("Skipping .cpu()/.item() sync demo (requires GPU).")
if not use_gpu:
    print("Skipping warm-up demo (requires GPU).")
else:
    size = 1536
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)

    print("Benchmarking with Warm-up:")

    # Phase 1: untimed warm-up iterations, each one waited for.
    print("Warm-up runs...")
    for _ in range(3):
        c = torch.matmul(a, b)
        torch.cuda.synchronize()

    # Phase 2: timed iterations. Every sample is bracketed by a sync on
    # both sides so it contains exactly one matmul.
    print("Measurement runs...")
    num_runs = 10
    times = []
    for _ in range(num_runs):
        torch.cuda.synchronize()
        start_time = time.time()
        c = torch.matmul(a, b)
        torch.cuda.synchronize()
        end_time = time.time()
        elapsed_ms = (end_time - start_time) * 1000
        times.append(elapsed_ms)

    mean_ms = np.mean(times)
    std_ms = np.std(times)
    print(f"Average time over {num_runs} measurement runs: {mean_ms:.4f} ms")
    print(f"Standard deviation: {std_ms:.4f} ms")
if use_gpu:
    size_bytes = 128 * 1024 * 1024  # 128 MB
    elements = size_bytes // 4  # float32 -> 4 bytes per element
    cpu_tensor_pageable = torch.randn(elements, device='cpu')
    # Create a tensor directly in pinned memory and fill it with the same data
    cpu_tensor_pinned = torch.empty(elements, device='cpu', pin_memory=True)
    cpu_tensor_pinned.copy_(cpu_tensor_pageable)

    num_runs = 10
    warmup = 2

    def _mean_copy_time_ms(dst, src):
        """Return the mean time in ms of `dst.copy_(src)`.

        Runs `warmup` untimed copies, then `num_runs` timed ones. `dst` is
        allocated by the caller, so the timed interval contains only the
        copy itself and no memory allocation.
        """
        for _ in range(warmup):
            dst.copy_(src)
            torch.cuda.synchronize()
        times = []
        for _ in range(num_runs):
            # Sync on both sides so each sample holds exactly one copy.
            torch.cuda.synchronize()
            start = time.perf_counter()
            dst.copy_(src)
            torch.cuda.synchronize()
            times.append((time.perf_counter() - start) * 1000)  # ms
        return np.mean(times)

    def benchmark_h2d(tensor):
        """Return the mean host-to-device copy time in ms for CPU `tensor`.

        The device buffer is allocated once, outside the timed region.
        Freeing it and emptying the cache after every run would force each
        timed copy to obtain fresh device memory as well.
        """
        gpu_buffer = torch.empty(tensor.shape, dtype=tensor.dtype, device=device)
        mean_ms = _mean_copy_time_ms(gpu_buffer, tensor)
        del gpu_buffer  # Free GPU memory
        torch.cuda.empty_cache()
        return mean_ms

    # --- Benchmark H2D transfer ---
    time_pageable = benchmark_h2d(cpu_tensor_pageable)
    time_pinned = benchmark_h2d(cpu_tensor_pinned)

    print(f"H2D Transfer Time ({size_bytes / (1024**2):.0f} MB):")
    print(f" Pageable Memory: {time_pageable:.4f} ms")
    print(f" Pinned Memory: {time_pinned:.4f} ms")
    if time_pinned < time_pageable:
        print(f" Speedup: {time_pageable / time_pinned:.2f}x")
    else:
        print(" (No speedup observed in this run)")

    # --- Benchmark D2H transfer ---
    gpu_tensor = cpu_tensor_pageable.to(device)  # Start with data on GPU

    def benchmark_d2h(target_cpu_tensor):
        """Return the mean device-to-host copy time in ms into `target_cpu_tensor`.

        The source is the `gpu_tensor` created just above.
        """
        return _mean_copy_time_ms(target_cpu_tensor, gpu_tensor)

    time_d2h_pageable = benchmark_d2h(torch.empty_like(cpu_tensor_pageable))
    time_d2h_pinned = benchmark_d2h(torch.empty_like(cpu_tensor_pinned, pin_memory=True))

    print(f"\nD2H Transfer Time ({size_bytes / (1024**2):.0f} MB):")
    print(f" To Pageable Memory: {time_d2h_pageable:.4f} ms")
    print(f" To Pinned Memory: {time_d2h_pinned:.4f} ms")
    if time_d2h_pinned < time_d2h_pageable:
        print(f" Speedup: {time_d2h_pageable / time_d2h_pinned:.2f}x")
    else:
        print(" (No speedup observed in this run)")

    del gpu_tensor
    torch.cuda.empty_cache()

else:
    print("Skipping memory transfer demo (requires GPU).")
Input Shape Effects & `cudnn.benchmark`\n","#\n","# GPU performance, especially for operations like convolutions and matrix multiplications handled by libraries like cuDNN, can be sensitive to input tensor shapes (sizes, strides). (See Slides 12, 13)\n","#\n","# - **Tile/Wave Quantization:** Hardware resources (SMs, memory bandwidth) are often utilized most efficiently when problem sizes align well with the hardware's internal tiling or scheduling strategies. Performance might not scale smoothly and can sometimes exhibit step-like behavior as dimensions change.\n","# - **`torch.backends.cudnn.benchmark = True`:** This tells cuDNN to run benchmarks for different algorithms for the specific input sizes encountered during the *first* pass of an operation (like `nn.Conv2d`). It then caches the fastest algorithm for those specific sizes.\n","#\n","# **Use `cudnn.benchmark = True` if:**\n","# - Your input sizes (batch size, image dimensions, etc.) are **fixed** throughout training/inference.\n","#\n","# **Avoid `cudnn.benchmark = True` if:**\n","# - Your input sizes vary often (e.g., variable batch sizes, different image resolutions). The overhead of benchmarking each new size can outweigh the benefits. 
if use_gpu and torch.backends.cudnn.is_available():
    print("Benchmarking MatMul with varying sizes...")
    fixed_dim = 2048
    variable_dims = list(range(1024, 3072, 64))  # Vary one dimension

    def _sweep_matmul_ms(lhs, dims, warmup=2, repeats=5):
        """Return the mean `lhs @ rhs` time in ms for each width in `dims`.

        For every `dim` a fresh (lhs.shape[1], dim) right-hand matrix is
        created on lhs's device. `warmup` untimed products run first, so any
        first-call overhead for a new shape stays out of the measurement.
        Then `repeats` products are timed together and averaged.
        """
        mean_times = []
        for dim in dims:
            rhs = torch.randn(lhs.shape[1], dim, device=lhs.device)
            for _ in range(warmup):
                torch.matmul(lhs, rhs)
            torch.cuda.synchronize()
            start = time.perf_counter()
            for _ in range(repeats):
                out = torch.matmul(lhs, rhs)
            torch.cuda.synchronize()
            end = time.perf_counter()
            mean_times.append(((end - start) / repeats) * 1000)  # Avg time in ms
            del rhs, out  # Free memory before the next (larger) shape
            torch.cuda.empty_cache()
        return mean_times

    a = torch.randn(fixed_dim, fixed_dim, device=device)  # Reusable tensor

    # Remember the caller's setting and put it back even if a sweep fails,
    # so running this cell never changes global backend state.
    previous_cudnn_benchmark = torch.backends.cudnn.benchmark
    try:
        # --- Default Mode ---
        torch.backends.cudnn.benchmark = False
        print("Running with cudnn.benchmark = False")
        times_default = _sweep_matmul_ms(a, variable_dims)

        # --- Benchmark Mode ---
        # NOTE: cudnn.benchmark controls algorithm selection for cuDNN ops
        # such as convolutions. torch.matmul is not a cuDNN op, so the flag
        # is not expected to move these numbers. The second sweep acts as a
        # control that shows run-to-run noise. For ops the flag does affect,
        # the first call with a new shape pays the tuning cost, which is why
        # the mode suits *constant* input sizes.
        torch.backends.cudnn.benchmark = True
        print("Running with cudnn.benchmark = True")
        times_benchmark_mode = _sweep_matmul_ms(a, variable_dims)
    finally:
        torch.backends.cudnn.benchmark = previous_cudnn_benchmark

    # --- Plotting ---
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(variable_dims, times_default, label='cudnn.benchmark = False', marker='o')
    ax.plot(variable_dims, times_benchmark_mode, label='cudnn.benchmark = True', marker='x')
    ax.set_xlabel("Variable Dimension Size (N in KxN MatMul)")
    ax.set_ylabel("Average Execution Time (ms)")
    ax.set_title("MatMul Performance vs. Input Shape (M=2048, K=2048)")
    ax.legend()
    ax.grid(True)
    plt.show()

    print("\nNote: Performance variations can be subtle and depend heavily on")
    print("the specific GPU, CUDA version, and operation.")
    print("The 'steps' or non-smoothness relate to Tile/Wave Quantization effects.")
    print("cudnn.benchmark=True is most effective for *fixed* input sizes.")
else:
    print("Skipping cudnn.benchmark demo (requires GPU and cuDNN).")
try:
    import torch.utils.benchmark as benchmark
except ImportError:
    print("torch.utils.benchmark not available (requires recent PyTorch version).")
    benchmark = None

if not benchmark:
    print("torch.utils.benchmark not imported.")
elif not use_gpu:
    print("Skipping torch.utils.benchmark demo (requires GPU).")
else:
    size = 2048
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)
    stmt = "torch.matmul(a, b)"

    # Basic usage: the statement is a string, and the names it uses are
    # supplied through `globals`.
    timer = benchmark.Timer(
        stmt=stmt,
        globals={'a': a, 'b': b},
    )

    # Run the benchmark
    measurement = timer.timeit(100)  # Run stmt 100 times for timing

    print("Benchmarking MatMul with torch.utils.benchmark:")
    print(measurement)

    # Example with setup and an explicit thread count
    timer_with_setup = benchmark.Timer(
        stmt="torch.matmul(x, y)",
        setup="x = torch.randn(s, s, device=dev); y = torch.randn(s, s, device=dev)",
        globals={'s': size, 'dev': device},
        num_threads=1,  # Control CPU threads used by PyTorch (relevant for some ops)
    )
    measurement_setup = timer_with_setup.timeit(50)
    print("\nBenchmarking with setup:")
    print(measurement_setup)

    # Comparing two versions. Measurements that share `label` and
    # `sub_label` are placed on the same row of the Compare table, one
    # column per `description`. `sub_label` is now actually passed to the
    # timers, so the row is titled with the problem size.
    label = "MatMul"
    sub_label = f"{size}x{size}"
    # The second entry stands in for an optimized variant; for the demo it
    # runs the very same statement again.
    results = [
        benchmark.Timer(
            stmt=stmt,
            globals={'a': a, 'b': b},
            label=label,
            sub_label=sub_label,
            description=description,
        ).blocked_autorange()
        for description in ("Standard", "Hypothetical Opt")
    ]

    compare = benchmark.Compare(results)
    print("\nComparing implementations:")
    compare.print()
Focus on algorithmic improvements, data loading, and minimizing unnecessary work first.\n","#\n","# This introduction lays the groundwork for exploring more advanced topics like profiling, distributed training, and model optimization covered later in the course.\n","\n","# %%\n","print(\"End of tutorial.\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"id":"49zOjwgVrDQ-","executionInfo":{"status":"ok","timestamp":1745903385376,"user_tz":-420,"elapsed":3401,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"923fb0a6-ed6b-4205-e209-e3438891ea4e"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["CUDA device detected: NVIDIA L4\n","Total GPU Memory: 22.16 GB\n","Incorrect MM time (no sync): 0.5832 ms\n","Correct MM time (with sync): 1.2836 ms\n","Correct MM time (timeit, avg over 10 runs): 1.0801 ms\n","Time for matmul + .cpu() transfer: 3.9089 ms\n","Time for sum + .item(): 0.0453 ms\n","Sum result: -13279.021484375\n","Benchmarking with Warm-up:\n","Warm-up runs...\n","Measurement runs...\n","Average time over 10 measurement runs: 0.4783 ms\n","Standard deviation: 0.0033 ms\n","H2D Transfer Time (128 MB):\n"," Pageable Memory: 28.5886 ms\n"," Pinned Memory: 10.9343 ms\n"," Speedup: 2.61x\n","\n","D2H Transfer Time (128 MB):\n"," To Pageable Memory: 26.6042 ms\n"," To Pinned Memory: 10.2439 ms\n"," Speedup: 2.60x\n","Benchmarking MatMul with varying sizes...\n","Running with cudnn.benchmark = False\n","Running with cudnn.benchmark = True\n"]},{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["\n","Note: Performance variations can be subtle and depend heavily on\n","the specific GPU, CUDA version, and operation.\n","The 'steps' or non-smoothness relate to Tile/Wave Quantization effects.\n","cudnn.benchmark=True is most effective for *fixed* input sizes.\n","Benchmarking MatMul with torch.utils.benchmark:\n","\n","torch.matmul(a, b)\n"," 1.25 ms\n"," 1 measurement, 100 runs , 1 thread\n","\n","Benchmarking with setup:\n","\n","torch.matmul(x, y)\n","setup: x = torch.randn(s, s, device=dev); y = torch.randn(s, s, device=dev)\n"," 1.36 ms\n"," 1 measurement, 50 runs , 1 thread\n","\n","Comparing implementations:\n","[------------------------ MatMul ------------------------]\n"," | Standard | Hypothetical Opt\n","1 threads: -----------------------------------------------\n"," torch.matmul(a, b) | 1.4 | 1.4 \n","\n","Times are in milliseconds (ms).\n","\n","End of tutorial.\n"]}]},{"cell_type":"code","source":["import torch\n","\n","torch.cuda.is_available()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Cdu35X-xrZvK","executionInfo":{"status":"ok","timestamp":1745899326222,"user_tz":-420,"elapsed":12,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"52bb0f29-a4a6-4c42-be49-48dc3973995e"},"execution_count":8,"outputs":[{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":8}]},{"cell_type":"code","source":["torch.cuda.get_device_properties(0)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oDctqTgdtYlJ","executionInfo":{"status":"ok","timestamp":1745899336494,"user_tz":-420,"elapsed":16,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"53c06086-ea7d-4035-fe8a-c0d218878143"},"execution_count":9,"outputs":[{"output_type":"execute_result","data":{"text/plain":["_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22692MB, 
def allocate_empty_tensor(dim_size):
    """Allocate an uninitialized (4096, dim_size) float32 tensor on "cuda" and drop it.

    Nothing is returned and no reference to the tensor is kept, so it is
    released as soon as the call finishes. The cells that follow call
    `torch.cuda.memory_reserved()` right after this function; presumably it
    exists to show how reserved GPU memory reacts to short-lived allocations.

    Args:
        dim_size: Size of the tensor's second dimension.
    """
    torch.empty((4096, dim_size), dtype=torch.float32, device="cuda")
Pham","userId":"04566654796696849937"}},"outputId":"c3cbdf69-584b-4453-ce8e-759d0b62d73f"},"execution_count":14,"outputs":[{"output_type":"stream","name":"stdout","text":["Tue Apr 29 04:06:49 2025 \n","+-----------------------------------------------------------------------------------------+\n","| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n","|-----------------------------------------+------------------------+----------------------+\n","| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n","| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n","| | | MIG M. |\n","|=========================================+========================+======================|\n","| 0 NVIDIA L4 Off | 00000000:00:03.0 Off | 0 |\n","| N/A 59C P0 29W / 72W | 387MiB / 23034MiB | 0% Default |\n","| | | N/A |\n","+-----------------------------------------+------------------------+----------------------+\n"," \n","+-----------------------------------------------------------------------------------------+\n","| Processes: |\n","| GPU GI CI PID Type Process name GPU Memory |\n","| ID ID Usage |\n","|=========================================================================================|\n","+-----------------------------------------------------------------------------------------+\n"]}]},{"cell_type":"code","source":["torch.cuda.empty_cache()\n","torch.cuda.memory_reserved()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Ona5fnPWud3V","executionInfo":{"status":"ok","timestamp":1745899630394,"user_tz":-420,"elapsed":40,"user":{"displayName":"Laam 
Pham","userId":"04566654796696849937"}},"outputId":"e9fd28f7-8c0b-4bf6-d9ae-eae0e2cbbebf"},"execution_count":15,"outputs":[{"output_type":"execute_result","data":{"text/plain":["81788928"]},"metadata":{},"execution_count":15}]},{"cell_type":"code","source":["!nvidia-smi"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"IwIzk9oPui6z","executionInfo":{"status":"ok","timestamp":1745899640946,"user_tz":-420,"elapsed":206,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"42d0b38c-c194-4704-b2be-27761c5311ad"},"execution_count":16,"outputs":[{"output_type":"stream","name":"stdout","text":["Tue Apr 29 04:07:20 2025 \n","+-----------------------------------------------------------------------------------------+\n","| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n","|-----------------------------------------+------------------------+----------------------+\n","| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n","| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n","| | | MIG M. 
|\n","|=========================================+========================+======================|\n","| 0 NVIDIA L4 Off | 00000000:00:03.0 Off | 0 |\n","| N/A 60C P0 30W / 72W | 307MiB / 23034MiB | 0% Default |\n","| | | N/A |\n","+-----------------------------------------+------------------------+----------------------+\n"," \n","+-----------------------------------------------------------------------------------------+\n","| Processes: |\n","| GPU GI CI PID Type Process name GPU Memory |\n","| ID ID Usage |\n","|=========================================================================================|\n","+-----------------------------------------------------------------------------------------+\n"]}]},{"cell_type":"code","source":["allocate_empty_tensor(2048)\n","torch.cuda.memory_reserved()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_H-AF34TuldA","executionInfo":{"status":"ok","timestamp":1745899660225,"user_tz":-420,"elapsed":44,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"a4dc55cc-0f26-41b0-f018-5d31c2f6734e"},"execution_count":17,"outputs":[{"output_type":"execute_result","data":{"text/plain":["115343360"]},"metadata":{},"execution_count":17}]},{"cell_type":"code","source":["allocate_empty_tensor(1024)\n","torch.cuda.memory_reserved()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Xe6GNbzTuqM1","executionInfo":{"status":"ok","timestamp":1745899671295,"user_tz":-420,"elapsed":12,"user":{"displayName":"Laam 
def batched_dot_mul_sum(a, b):
    """Computes batched dot by multiplying and summing.

    Multiplies `a` and `b` elementwise and sums over the last dimension, so
    the result has the inputs' shape without that last dimension.
    """
    return a.mul(b).sum(-1)


def batched_dot_bmm(a, b):
    """Computes batched dot by reducing to bmm.

    All leading dimensions are folded into one batch dimension, each row of
    `a` is multiplied with the matching column of `b` as a (1, D) x (D, 1)
    product, and the result is reshaped back to `a`'s leading dimensions.
    For 2-D inputs this is the same 1-D result as before. For other ranks
    the output shape now agrees with `batched_dot_mul_sum` instead of being
    flattened.
    """
    batch_shape = a.shape[:-1]  # captured before `a` is reshaped below
    a = a.reshape(-1, 1, a.shape[-1])
    b = b.reshape(-1, b.shape[-1], 1)
    return torch.bmm(a, b).reshape(batch_shape)


# Input for benchmarking
x = torch.randn(10000, 64)

# Ensure that both functions compute the same output
assert batched_dot_mul_sum(x, x).allclose(batched_dot_bmm(x, x))
import torch.utils.benchmark as benchmark

# Time both batched-dot implementations on the CPU tensor `x` with
# torch.utils.benchmark.Timer. Each statement pulls its function from
# __main__ through the `setup` string and receives `x` through `globals`.
t0 = benchmark.Timer(
    stmt="batched_dot_mul_sum(x, x)",
    setup="from __main__ import batched_dot_mul_sum",
    globals=dict(x=x),
)
t1 = benchmark.Timer(
    stmt="batched_dot_bmm(x, x)",
    setup="from __main__ import batched_dot_bmm",
    globals=dict(x=x),
)

# Run 100 iterations per timer and print the resulting Measurement.
for cpu_timer in (t0, t1):
    print(cpu_timer.timeit(100))
# we can change it globally for PyTorch and measure the impact
# Ask PyTorch for the current value instead of reusing a variable left over
# from an earlier cell: this keeps the cell correct when it is re-run or
# executed out of order.
prev_num_threads = torch.get_num_threads()
torch.set_num_threads(2)

try:
    num_threads = torch.get_num_threads()
    print(f"Benchmarking on {num_threads} threads")

    t0 = benchmark.Timer(
        stmt="batched_dot_mul_sum(x, x)",
        setup="from __main__ import batched_dot_mul_sum",
        globals={"x": x},
        num_threads=num_threads,
        label="Multithreaded batch dot",
        sub_label="Implemented using mul and sum",
    )

    t1 = benchmark.Timer(
        stmt="batched_dot_bmm(x, x)",
        setup="from __main__ import batched_dot_bmm",
        globals={"x": x},
        num_threads=num_threads,
        label="Multithreaded batch dot",
        sub_label="Implemented using bmm",
    )

    print(t0.timeit(100))
    print(t1.timeit(100))
    # in this case, we don't get any speedup, likely due to the overhead
finally:
    # Always put the global thread count back, even if a benchmark raised,
    # so later cells are not silently limited to 2 threads.
    torch.set_num_threads(prev_num_threads)
import timeit

# Same comparison as before, but on a CUDA tensor and with the stdlib
# `timeit` module. The timed statements contain no torch.cuda.synchronize()
# call, so these numbers are only a rough illustration of the warmup effect.
x = torch.randn(10000, 1024, device="cuda")

t0 = timeit.Timer(
    "batched_dot_mul_sum(x, x)",
    "from __main__ import batched_dot_mul_sum",
    globals={"x": x},
)
t1 = timeit.Timer(
    "batched_dot_bmm(x, x)",
    "from __main__ import batched_dot_bmm",
    globals={"x": x},
)

# Ran each twice to show difference before/after warmup
n_runs = 100
for prefix, gpu_timer in (("mul_sum(x, x): ", t0), ("bmm(x, x): ", t1)):
    for _ in range(2):
        mean_us = gpu_timer.timeit(n_runs) / n_runs * 1e6
        print(f"{prefix}{mean_us:>5.1f} us")
# Check for GPU availability and set device.
# `use_gpu` gates every GPU-only demo below; `props` holds the CUDA device
# properties, or None when no CUDA device is available.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if use_gpu:
    print(f"CUDA device found: {torch.cuda.get_device_name(0)}")
    props = torch.cuda.get_device_properties(0)
    print(f" Compute Capability: {props.major}.{props.minor}")
    print(f" Total Memory: {props.total_memory / (1024**3):.2f} GB")
    print(f" Multiprocessor Count: {props.multi_processor_count}")
else:
    print("CUDA device not found. Using CPU (GPU examples will be skipped or run on CPU).")
    props = None  # No properties available
if use_gpu:

    def _memory_line(stage):
        """Format the allocator's current allocated/reserved sizes in MB for `stage`."""
        allocated_mb = torch.cuda.memory_allocated() / (1024**2)
        reserved_mb = torch.cuda.memory_reserved() / (1024**2)
        return f"{stage} - Allocated: {allocated_mb:.2f} MB, Reserved: {reserved_mb:.2f} MB"

    print(_memory_line("Initial"))

    def allocate_temp_tensor(rows, cols):
        """Allocate an uninitialised (rows, cols) tensor on `device` and report memory.

        The tensor is only referenced by a local variable, so it is dropped
        as soon as the function returns.
        """
        scratch = torch.empty(rows, cols, device=device)
        print(_memory_line("Inside func"))

    # 2048x2048 (~16 MB), then a smaller 1024x1024 (~4 MB) that should fit in
    # the already reserved block, then a larger 4096x4096 (~64 MB) that may
    # need a new block and so grow the reserved memory.
    for stage, side in (("After func", 2048), ("After smaller", 1024), ("After larger", 4096)):
        allocate_temp_tensor(side, side)
        print(_memory_line(stage))

    # Manually clear the cache so the allocator gives up its unused reserved memory.
    # WARNING: This causes CPU-GPU synchronization and can hurt performance. Avoid frequent use.
    torch.cuda.empty_cache()
    print(_memory_line("After empty_cache"))

else:
    print("Skipping memory management demo (requires GPU).")
**Averaging:** Run the operation multiple times and average the results for stability.\n","#\n","# `torch.utils.benchmark` is the recommended tool as it handles these aspects automatically.\n","\n","# %%\n","# Define functions to benchmark (from original notebook)\n","def batched_dot_mul_sum(a, b):\n"," \"\"\"Computes batched dot by multiplying and summing\"\"\"\n"," return a.mul(b).sum(-1)\n","\n","def batched_dot_bmm(a, b):\n"," \"\"\"Computes batched dot by reducing to bmm\"\"\"\n"," a = a.reshape(-1, 1, a.shape[-1])\n"," b = b.reshape(-1, b.shape[-1], 1)\n"," # Use .squeeze() instead of flatten for robustness if batch dim is 1\n"," return torch.bmm(a, b).squeeze(-1).squeeze(-1)\n","\n","if use_gpu:\n"," # Input data on GPU\n"," x_gpu = torch.randn(10000, 1024, device=device)\n","\n"," # Ensure correctness\n"," res1 = batched_dot_mul_sum(x_gpu, x_gpu)\n"," res2 = batched_dot_bmm(x_gpu, x_gpu)\n"," assert torch.allclose(res1, res2, atol=1e-5), \"Functions produce different results!\"\n","\n"," print(\"Benchmarking with torch.utils.benchmark (handles warmup & sync):\")\n","\n"," t0 = benchmark.Timer(\n"," stmt=\"batched_dot_mul_sum(x, x)\",\n"," globals={\"x\": x_gpu, \"batched_dot_mul_sum\": batched_dot_mul_sum}\n"," )\n","\n"," t1 = benchmark.Timer(\n"," stmt=\"batched_dot_bmm(x, x)\",\n"," globals={\"x\": x_gpu, \"batched_dot_bmm\": batched_dot_bmm}\n"," )\n","\n"," # timeit runs the statement multiple times and returns measurement object\n"," m0 = t0.timeit(100)\n"," m1 = t1.timeit(100)\n","\n"," print(f\"Method 1 (mul_sum):\\n{m0}\")\n"," print(f\"Method 2 (bmm):\\n{m1}\")\n","\n"," # Comparing results\n"," compare = benchmark.Compare([m0, m1])\n"," print(\"\\nComparison:\")\n"," compare.print()\n","\n"," # Quick check with IPython magic (also useful, handles basic timing)\n"," print(\"\\nBenchmarking with %timeit magic:\")\n"," %timeit batched_dot_mul_sum(x_gpu, x_gpu); torch.cuda.synchronize()\n"," %timeit batched_dot_bmm(x_gpu, x_gpu); 
torch.cuda.synchronize()\n","\n","else:\n"," print(\"Skipping benchmarking demo (requires GPU).\")\n","\n","# %% [markdown]\n","# **Benchmarking Takeaway:** Use `torch.utils.benchmark` for reliable microbenchmarks. It handles warmup, synchronization, and provides statistical summaries. `%timeit` is convenient for quick checks in notebooks, but remember to add `torch.cuda.synchronize()` manually.\n","\n","# %% [markdown]\n","# ## 4. Performance Features: Streams & Graphs\n","#\n","# ### CUDA Streams\n","# Streams allow **concurrent execution** of independent CUDA operations on the GPU, potentially overlapping computation with data transfers.\n","#\n","# **Use Case:** Overlap data loading/transfers (H2D) for the *next* batch with the computation of the *current* batch.\n","\n","# %%\n","if use_gpu and props and props.major >= 3: # Streams generally effective on Kepler+\n"," # Setup: Heavy compute task (MatMul) and data to transfer\n"," compute_stream = torch.cuda.Stream()\n"," h2d_stream = torch.cuda.Stream()\n","\n"," # Large matrix for compute-bound task\n"," A_gpu = torch.randn(8192, 8192, device=device)\n","\n"," # List of CPU tensors to simulate data loading pipeline\n"," # Use pinned memory for potentially faster H2D transfers\n"," cpu_data = [torch.randn(2048, 2048, device='cpu').pin_memory() for _ in range(10)]\n"," gpu_targets = [torch.empty_like(d, device=device) for d in cpu_data] # Pre-allocate GPU targets\n","\n"," # Ensure setup is complete\n"," torch.cuda.synchronize()\n","\n"," # --- Sequential Execution (Compute then Transfer) ---\n"," start_seq = perf_counter()\n"," comp_result = torch.matmul(A_gpu, A_gpu)\n"," # Ensure compute is done before starting transfers (implicit if not using streams)\n"," torch.cuda.synchronize()\n"," transfer_start = perf_counter()\n"," for i in range(len(cpu_data)):\n"," gpu_targets[i].copy_(cpu_data[i], non_blocking=False) # Blocking copy\n"," # Ensure transfers are done\n"," torch.cuda.synchronize()\n"," end_seq = 
perf_counter()\n"," print(f\"Sequential: Compute took ~{transfer_start - start_seq:.4f}s, Transfer took ~{end_seq - transfer_start:.4f}s, Total: {end_seq - start_seq:.4f}s\")\n","\n","\n"," # --- Streamed Execution (Overlap Compute and Transfer) ---\n"," start_stream = perf_counter()\n"," # Queue computation on the compute stream\n"," with torch.cuda.stream(compute_stream):\n"," comp_result_stream = torch.matmul(A_gpu, A_gpu)\n","\n"," # Queue transfers on the H2D stream\n"," with torch.cuda.stream(h2d_stream):\n"," for i in range(len(cpu_data)):\n"," # non_blocking=True is essential for overlap with other streams\n"," gpu_targets[i].copy_(cpu_data[i], non_blocking=True)\n","\n"," # Wait for *both* streams to complete\n"," compute_stream.synchronize()\n"," h2d_stream.synchronize()\n"," # OR simply torch.cuda.synchronize() waits for all streams on the device\n"," # torch.cuda.synchronize()\n"," end_stream = perf_counter()\n"," print(f\"Streamed (Overlap): Total time: {end_stream - start_stream:.4f}s\")\n","\n"," # Cleanup large tensor\n"," del A_gpu, comp_result, comp_result_stream\n"," torch.cuda.empty_cache()\n","\n","else:\n"," print(\"Skipping CUDA Streams demo (requires GPU, preferably Compute Capability >= 3.0).\")\n","\n","# %% [markdown]\n","# **Streams Takeaway:** Streams can hide data transfer latency behind computation *if* the tasks are independent and the GPU isn't already fully saturated by one task alone. 
if use_gpu:
    # Function with many small operations
    def func_many_small_ops(x):
        """Apply 200 rounds of small elementwise ops to `x` and return the result.

        Each round launches several tiny kernels, which makes this a
        launch-overhead-bound workload.
        """
        for _ in range(200):  # Reduced iterations for faster demo
            x = torch.sigmoid(x * 1.01 + 0.01) + torch.relu(x - 0.5)
        return x

    # Input for capture and replay
    static_input = torch.randn(1024, 1024, device=device)

    # --- Standard Eager Execution ---
    print("Benchmarking Eager Execution:")
    t_eager = benchmark.Timer(
        stmt="func_many_small_ops(x)",
        globals={"x": static_input, "func_many_small_ops": func_many_small_ops},
    )
    m_eager = t_eager.timeit(50)
    print(m_eager)

    # --- CUDA Graph Execution ---
    # The graph reads from this fixed buffer on every replay.
    static_input_capture = static_input.clone()

    # 1. Warmup (important before graph capture).
    # Follow the pattern from the PyTorch CUDA Graphs notes: run the workload
    # a few times on a side stream, then make the current stream wait for it.
    warmup_stream = torch.cuda.Stream()
    warmup_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(warmup_stream):
        for _ in range(3):
            func_many_small_ops(static_input_capture)
    torch.cuda.current_stream().wait_stream(warmup_stream)
    torch.cuda.synchronize()

    # 2. Capture
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        # Operations inside this block are captured; the tensor returned here
        # is the graph's output buffer and is overwritten by every replay.
        static_output_capture = func_many_small_ops(static_input_capture)

    # 3. Replay
    # To run on new data, copy it into the captured input tensor *before* replay
    new_data = torch.randn_like(static_input)
    static_input_capture.copy_(new_data)

    print("\nBenchmarking Graph Replay:")
    t_graph = benchmark.Timer(
        stmt="graph.replay()",
        # `setup` runs outside the timed loop (once per timing pass), so the
        # copy is not part of the measurement and the input is NOT refreshed
        # before every individual replay.
        setup="static_input_capture.copy_(torch.randn_like(static_input))",
        globals={"graph": graph, "static_input_capture": static_input_capture, "torch": torch, "static_input": static_input},
    )
    m_graph = t_graph.timeit(50)
    print(m_graph)

    # Verify: replay on the data currently in the input buffer and compare
    # with an eager run on that same data.
    graph.replay()
    torch.cuda.synchronize()
    eager_output = func_many_small_ops(static_input_capture)
    print(f"\nGraph and Eager outputs close: {torch.allclose(static_output_capture, eager_output)}")

else:
    print("Skipping CUDA Graphs demo (requires GPU).")
# Write the failing example to its own file and run it in a child process.
# A device-side assert leaves the CUDA context of the process that triggered
# it unusable, so running the bad lookup directly in the notebook kernel
# would make every later CUDA call in this notebook fail as well.
import os
import subprocess
import sys
from pathlib import Path

import torch

GPU_ERROR_SCRIPT = r'''
import torch

# Use GPU if available, otherwise skip gracefully
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Running on {device}")

    try:
        embedding = torch.nn.Embedding(10, 4).to(device)
        # Error: Index 10 is out of bounds for embedding size 10 (valid indices 0-9)
        bad_input = torch.tensor([[1, 2], [3, 10]], dtype=torch.long, device=device)

        print("Launching embedding lookup (potentially bad)...")
        embedded_vals = embedding(bad_input)  # Error occurs here async

        print("Launching subsequent operation...")
        result = torch.sigmoid(embedded_vals)  # Another async op

        print("Triggering synchronization via .sum().item()...")
        loss = result.sum().item()  # Error likely reported here in default mode
        print(f"Loss (if successful): {loss}")

    except RuntimeError as e:
        print("\n--- Caught RuntimeError ---")
        print(e)
        # Check if the error message indicates a device-side assert
        if "device-side assert triggered" in str(e):
            print("\nSuggestion: Rerun with 'CUDA_LAUNCH_BLOCKING=1 python gpu_error_example.py' for a more precise traceback.")
        print("--- End of Error ---")

else:
    print("Skipping GPU error example (CUDA not available).")

print("Script finished.")
'''

script_path = Path("gpu_error_example.py")
script_path.write_text(GPU_ERROR_SCRIPT.lstrip())

if torch.cuda.is_available():
    # Run once in the default asynchronous mode and once with blocking
    # launches, to contrast where the error is reported.
    for launch_blocking in ("0", "1"):
        print(f"\n=== CUDA_LAUNCH_BLOCKING={launch_blocking} ===")
        child_env = dict(os.environ, CUDA_LAUNCH_BLOCKING=launch_blocking)
        # No check=True: the child is expected to hit a CUDA error and may
        # exit with a non-zero status.
        completed = subprocess.run(
            [sys.executable, str(script_path)],
            env=child_env,
            capture_output=True,
            text=True,
        )
        print(completed.stdout, end="")
        # The device-side assert can be very noisy on stderr; show only the tail.
        stderr_tail = completed.stderr.strip().splitlines()[-5:]
        if stderr_tail:
            print("[stderr tail]")
            print("\n".join(stderr_tail))
else:
    print("Skipping GPU error example (CUDA not available).")
    print("Script finished.")
.sum().item()...\n","#\n","# --- Caught RuntimeError ---\n","# CUDA error: device-side assert triggered\n","# CUDA kernel errors might be asynchronously reported at some other API call, [...]\n","# Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","#\n","#\n","# Suggestion: Rerun with 'CUDA_LAUNCH_BLOCKING=1 python gpu_error_example.py' for a more precise traceback.\n","# --- End of Error ---\n","# Script finished.\n","# ```\n","# > Notice the error is caught near `.item()`, not the `embedding()` call.\n","#\n","# **Running with `CUDA_LAUNCH_BLOCKING=1` (simulated output):**\n","# ```\n","# Running on cuda\n","# Launching embedding lookup (potentially bad)...\n","#\n","# --- Caught RuntimeError ---\n","# CUDA error: device-side assert triggered\n","# Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","# --- End of Error ---\n","# Script finished.\n","# ```\n","# > With `CUDA_LAUNCH_BLOCKING=1`, the script would likely crash *during* the `embedding(bad_input)` call, giving a traceback pointing directly to that line. (Actual traceback varies).\n","\n","# %% [markdown]\n","# **Debugging Takeaway:** If you get generic CUDA errors, rerun with `CUDA_LAUNCH_BLOCKING=1` to pinpoint the faulty operation. Remember to turn it off for regular training/inference as it hurts performance.\n","\n","# %% [markdown]\n","# ## 6. Floating Point Precision\n","#\n","# Be aware that floating-point arithmetic is not always associative, and results can differ slightly between:\n","# - CPU vs. GPU execution\n","# - Different GPUs or CUDA versions\n","# - Different libraries (e.g., PyTorch vs. NumPy)\n","# - Even run-to-run on some hardware due to non-deterministic algorithms (though PyTorch tries to mitigate this).\n","#\n","# This is usually only problematic if your code relies on exact floating-point equality. For most DL tasks, small numerical differences are acceptable. 
`torch.allclose()` is useful for comparing tensors with tolerance.\n","\n","# %%\n","torch.manual_seed(1337)\n","x_cpu = torch.randn(1000, 1000, dtype=torch.float32)\n","x_gpu = x_cpu.to(device)\n","\n","def matrix_power_sum(x):\n"," # Use float64 for intermediate steps to reduce accumulation error magnitude for demo\n"," y = x.double() @ x.double() @ x.double() @ x.double() @ x.double() @ x.double()\n"," return y.sum().item()\n","\n","if use_gpu:\n"," # Disable deterministic algorithms for this demo if enabled globally\n"," deterministic_setting = torch.are_deterministic_algorithms_enabled()\n"," torch.use_deterministic_algorithms(False)\n","\n"," res_cpu = matrix_power_sum(x_cpu)\n"," res_gpu = matrix_power_sum(x_gpu)\n","\n"," print(f\"Result CPU: {res_cpu}\")\n"," print(f\"Result GPU: {res_gpu}\")\n"," print(f\"Difference: {abs(res_cpu - res_gpu)}\")\n"," print(f\"Relative Difference: {abs(res_cpu - res_gpu) / abs(res_cpu):.2e}\")\n","\n"," # Restore deterministic setting\n"," torch.use_deterministic_algorithms(deterministic_setting)\n","else:\n"," print(\"Skipping FP precision demo (requires GPU).\")\n","\n","\n","# %% [markdown]\n","# ## Conclusion & Key Takeaways\n","#\n","# Efficiently using GPUs in PyTorch involves understanding their asynchronous nature and memory hierarchy.\n","#\n","# - **Synchronization is Key:** Use `torch.cuda.synchronize()` for manual timing; rely on `torch.utils.benchmark` which handles it automatically.\n","# - **Warmup Before Benchmarking:** Initial runs can be slower.\n","# - **Manage Memory:** Minimize host-device transfers. Use pinned memory strategically for H2D. Avoid frequent `empty_cache()`.\n","# - **Leverage Advanced Features:** Use CUDA Streams to overlap computation and data transfer. 
Use CUDA Graphs to reduce launch overhead for small kernels.\n","# - **Debug Smart:** Use `CUDA_LAUNCH_BLOCKING=1` to locate the source of asynchronous GPU errors.\n","# - **Be Aware of Precision:** Expect small floating-point differences across devices/runs.\n","#\n","# Applying these principles will help you build faster and more efficient deep learning systems.\n","\n","# %%\n","print(\"End of tutorial.\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":883},"id":"whCpKigYwCMy","executionInfo":{"status":"error","timestamp":1745903402396,"user_tz":-420,"elapsed":126,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"494be443-fae5-4564-c9d8-2d2835328ab9"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["CUDA device found: NVIDIA L4\n"," Compute Capability: 8.9\n"," Total Memory: 22.16 GB\n"," Multiprocessor Count: 58\n","Incorrect MM time (measures launch overhead): 0.1014 ms\n","Correct MM time (with sync): 1.3654 ms\n","Timing with implicit sync via .item(): 0.1072 ms\n","Result (sum): -71633.71875\n","Initial - Allocated: 140.20 MB, Reserved: 178.00 MB\n","Inside func - Allocated: 156.20 MB, Reserved: 178.00 MB\n","After func - Allocated: 140.20 MB, Reserved: 178.00 MB\n","Inside func - Allocated: 144.20 MB, Reserved: 178.00 MB\n","After smaller - Allocated: 140.20 MB, Reserved: 178.00 MB\n","Inside func - Allocated: 204.20 MB, Reserved: 242.00 MB\n","After larger - Allocated: 140.20 MB, Reserved: 242.00 MB\n","After empty_cache - Allocated: 140.20 MB, Reserved: 156.00 MB\n","Benchmarking with torch.utils.benchmark (handles warmup & sync):\n","Method 1 (mul_sum):\n","\n","batched_dot_mul_sum(x, x)\n"," 364.62 us\n"," 1 measurement, 100 runs , 1 thread\n","Method 2 (bmm):\n","\n","batched_dot_bmm(x, x)\n"," 46.25 us\n"," 1 measurement, 100 runs , 1 thread\n","\n","Comparison:\n"]},{"output_type":"error","ename":"TypeError","evalue":"object of type 'NoneType' has no 
len()","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[0mcompare\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbenchmark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mCompare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mm0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mm1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 201\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\nComparison:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 202\u001b[0;31m \u001b[0mcompare\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 203\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[0;31m# Quick check with IPython magic (also useful, handles basic timing)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/utils/benchmark/utils/compare.py\u001b[0m in \u001b[0;36mprint\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0;34m\"\"\"Print formatted table\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 324\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 325\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 326\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_render\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/utils/benchmark/utils/compare.py\u001b[0m in \u001b[0;36m__str__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__str__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 293\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_render\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 294\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextend_results\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/utils/benchmark/utils/compare.py\u001b[0m in \u001b[0;36m_render\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcommon\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMeasurement\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_results\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0mgrouped_results\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_group_by_label\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_layout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mgroup\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgrouped_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/utils/benchmark/utils/compare.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcommon\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMeasurement\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_results\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0mgrouped_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_group_by_label\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_layout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mgroup\u001b[0m 
\u001b[0;32min\u001b[0m \u001b[0mgrouped_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/utils/benchmark/utils/compare.py\u001b[0m in \u001b[0;36m_layout\u001b[0;34m(self, results)\u001b[0m\n\u001b[1;32m 343\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_highlight_warnings\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 344\u001b[0m )\n\u001b[0;32m--> 345\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtable\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrender\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/utils/benchmark/utils/compare.py\u001b[0m in \u001b[0;36mrender\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0msr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"\"\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_cols\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 252\u001b[0;31m \u001b[0mcol_widths\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m)\u001b[0m 
\u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mstring_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 253\u001b[0m \u001b[0mfinalized_columns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\" | \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcenter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring_rows\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_widths\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[0moverall_width\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfinalized_columns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/utils/benchmark/utils/compare.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0msr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"\"\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_cols\u001b[0m \u001b[0;34m-\u001b[0m 
\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 252\u001b[0;31m \u001b[0mcol_widths\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mstring_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 253\u001b[0m \u001b[0mfinalized_columns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\" | \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcenter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring_rows\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_widths\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[0moverall_width\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfinalized_columns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/utils/benchmark/utils/compare.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0msr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"\"\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_cols\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 252\u001b[0;31m \u001b[0mcol_widths\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mstring_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 253\u001b[0m \u001b[0mfinalized_columns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\" | \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcenter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m 
\u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring_rows\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_widths\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[0moverall_width\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfinalized_columns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mTypeError\u001b[0m: object of type 'NoneType' has no len()"]}]},{"cell_type":"code","source":[],"metadata":{"id":"Fglbei4M8qD9"},"execution_count":null,"outputs":[]}]}