alpha31476 committed
Commit 87ef7b5 · verified · 1 Parent(s): 3f546f5

LDM-train-pass, checking results

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +102 -0
  2. .gitignore +25 -0
  3. .vscode/settings.json +3 -0
  4. DDPM/CeleabA.parquet +3 -0
  5. DDPM/_1_Mnist.ipynb +546 -0
  6. DDPM/_3_Activation-Checkpointing-Sequential.ipynb +216 -0
  7. DDPM/_4_Activation-Checkpointing-VAE.ipynb +444 -0
  8. DDPM/_5_Activation-Ckpt-VAE-CelebA.ipynb +0 -0
  9. Imgui/demo-newstyle.py +298 -0
  10. Imgui/demo.py +301 -0
  11. Imgui/imgui.ini +25 -0
  12. LDM/notebooks/_1_Main.ipynb +1481 -0
  13. LDM/notebooks/_2_Rough-LPIPS.ipynb +0 -0
  14. LDM/scripts/Main.py +2273 -0
  15. LDM/scripts/_1_Lpips.py +56 -0
  16. LDM/scripts/config.yaml +65 -0
  17. Vaani/39448.err +351 -0
  18. Vaani/39448.out +11 -0
  19. Vaani/IISc_VaaniProject_M_AP_Anantpur_00014520_1544240000_APATSR_190315_1880_16300.wav +3 -0
  20. Vaani/LDM/__init__.py +0 -0
  21. Vaani/LDM/notebooks/Vaani-subplot.png +3 -0
  22. Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-15_16.png +3 -0
  23. Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-30_16.png +3 -0
  24. Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-4.png +3 -0
  25. Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-5.png +3 -0
  26. Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-6.png +3 -0
  27. Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-6_16.png +3 -0
  28. Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-8_16.png +3 -0
  29. Vaani/LDM/notebooks/_1_Main.ipynb +0 -0
  30. Vaani/LDM/notebooks/_2_Rough-LPIPS.ipynb +0 -0
  31. Vaani/LDM/scripts/AE-training.log +126 -0
  32. Vaani/LDM/scripts/Main.py +2303 -0
  33. Vaani/LDM/scripts/SLURM-AE-Train.sh +21 -0
  34. Vaani/LDM/scripts/SLURM-AE-Train2.sh +21 -0
  35. Vaani/LDM/scripts/Vaani-VQVAE-Main.py +1151 -0
  36. Vaani/LDM/scripts/VaaniLDM/vqvaq_ckpt-15.pth +3 -0
  37. Vaani/LDM/scripts/VaaniLDM/vqvaq_ckpt.pth +3 -0
  38. Vaani/LDM/scripts/_1_Lpips.py +56 -0
  39. Vaani/LDM/scripts/__init__.py +0 -0
  40. Vaani/LDM/scripts/config.yaml +65 -0
  41. Vaani/LDM/scripts/dotdict.py +53 -0
  42. Vaani/SLURM_test.sh +20 -0
  43. Vaani/VQVAE_architecture.svg +0 -0
  44. Vaani/VQVAE_summary.txt +438 -0
  45. Vaani/VQVAE_training.sh +19 -0
  46. Vaani/Vaani-Audio-Image-English.csv +0 -0
  47. Vaani/Vaani-Images-Audio-MetaData.parquet +3 -0
  48. Vaani/Vaani-subplot.png +3 -0
  49. Vaani/VaaniLDM/ddpm_ckpt_epoch14.pt +3 -0
  50. Vaani/VaaniLDM/ddpm_ckpt_epoch15.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,105 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Vaani/IISc_VaaniProject_M_AP_Anantpur_00014520_1544240000_APATSR_190315_1880_16300.wav filter=lfs diff=lfs merge=lfs -text
37
+ Vaani/LDM/notebooks/Vaani-subplot.png filter=lfs diff=lfs merge=lfs -text
38
+ Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-15_16.png filter=lfs diff=lfs merge=lfs -text
39
+ Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-30_16.png filter=lfs diff=lfs merge=lfs -text
40
+ Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-4.png filter=lfs diff=lfs merge=lfs -text
41
+ Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-5.png filter=lfs diff=lfs merge=lfs -text
42
+ Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-6.png filter=lfs diff=lfs merge=lfs -text
43
+ Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-6_16.png filter=lfs diff=lfs merge=lfs -text
44
+ Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-8_16.png filter=lfs diff=lfs merge=lfs -text
45
+ Vaani/Vaani-subplot.png filter=lfs diff=lfs merge=lfs -text
46
+ Vaani/VaaniLDM/samples/x0_0.png filter=lfs diff=lfs merge=lfs -text
47
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-10_16.png filter=lfs diff=lfs merge=lfs -text
48
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-11_16.png filter=lfs diff=lfs merge=lfs -text
49
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-12_16.png filter=lfs diff=lfs merge=lfs -text
50
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-13_16.png filter=lfs diff=lfs merge=lfs -text
51
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-14_16.png filter=lfs diff=lfs merge=lfs -text
52
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-15_16.png filter=lfs diff=lfs merge=lfs -text
53
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-16_16.png filter=lfs diff=lfs merge=lfs -text
54
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-17_16.png filter=lfs diff=lfs merge=lfs -text
55
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-18_16.png filter=lfs diff=lfs merge=lfs -text
56
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-19_16.png filter=lfs diff=lfs merge=lfs -text
57
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-20_16.png filter=lfs diff=lfs merge=lfs -text
58
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-21_16.png filter=lfs diff=lfs merge=lfs -text
59
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-22_16.png filter=lfs diff=lfs merge=lfs -text
60
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-23_16.png filter=lfs diff=lfs merge=lfs -text
61
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-24_16.png filter=lfs diff=lfs merge=lfs -text
62
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-25_16.png filter=lfs diff=lfs merge=lfs -text
63
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-26_16.png filter=lfs diff=lfs merge=lfs -text
64
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-27_16.png filter=lfs diff=lfs merge=lfs -text
65
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-28_16.png filter=lfs diff=lfs merge=lfs -text
66
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-29_16.png filter=lfs diff=lfs merge=lfs -text
67
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-31_16.png filter=lfs diff=lfs merge=lfs -text
68
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-32_16.png filter=lfs diff=lfs merge=lfs -text
69
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-33_16.png filter=lfs diff=lfs merge=lfs -text
70
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-34_16.png filter=lfs diff=lfs merge=lfs -text
71
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-36_16.png filter=lfs diff=lfs merge=lfs -text
72
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-37_16.png filter=lfs diff=lfs merge=lfs -text
73
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-38_16.png filter=lfs diff=lfs merge=lfs -text
74
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-39_16.png filter=lfs diff=lfs merge=lfs -text
75
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-40_16.png filter=lfs diff=lfs merge=lfs -text
76
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-41_16.png filter=lfs diff=lfs merge=lfs -text
77
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-42_16.png filter=lfs diff=lfs merge=lfs -text
78
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-43_16.png filter=lfs diff=lfs merge=lfs -text
79
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-44_16.png filter=lfs diff=lfs merge=lfs -text
80
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-45_16.png filter=lfs diff=lfs merge=lfs -text
81
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-46_16.png filter=lfs diff=lfs merge=lfs -text
82
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-47_16.png filter=lfs diff=lfs merge=lfs -text
83
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-48_16.png filter=lfs diff=lfs merge=lfs -text
84
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-49_16.png filter=lfs diff=lfs merge=lfs -text
85
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-51_16.png filter=lfs diff=lfs merge=lfs -text
86
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-52_16.png filter=lfs diff=lfs merge=lfs -text
87
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-53_16.png filter=lfs diff=lfs merge=lfs -text
88
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-54_16.png filter=lfs diff=lfs merge=lfs -text
89
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-55_16.png filter=lfs diff=lfs merge=lfs -text
90
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-56_16.png filter=lfs diff=lfs merge=lfs -text
91
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-57_16.png filter=lfs diff=lfs merge=lfs -text
92
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-58_16.png filter=lfs diff=lfs merge=lfs -text
93
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-59_16.png filter=lfs diff=lfs merge=lfs -text
94
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-60_16.png filter=lfs diff=lfs merge=lfs -text
95
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-61_16.png filter=lfs diff=lfs merge=lfs -text
96
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-62_16.png filter=lfs diff=lfs merge=lfs -text
97
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-63_16.png filter=lfs diff=lfs merge=lfs -text
98
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-64_16.png filter=lfs diff=lfs merge=lfs -text
99
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-65_16.png filter=lfs diff=lfs merge=lfs -text
100
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-66_16.png filter=lfs diff=lfs merge=lfs -text
101
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-67_16.png filter=lfs diff=lfs merge=lfs -text
102
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-68_16.png filter=lfs diff=lfs merge=lfs -text
103
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-69_16.png filter=lfs diff=lfs merge=lfs -text
104
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-71_16.png filter=lfs diff=lfs merge=lfs -text
105
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-73_16.png filter=lfs diff=lfs merge=lfs -text
106
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-74_16.png filter=lfs diff=lfs merge=lfs -text
107
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-75_16.png filter=lfs diff=lfs merge=lfs -text
108
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-76_16.png filter=lfs diff=lfs merge=lfs -text
109
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-77_16.png filter=lfs diff=lfs merge=lfs -text
110
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-78_16.png filter=lfs diff=lfs merge=lfs -text
111
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-79_16.png filter=lfs diff=lfs merge=lfs -text
112
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-81_16.png filter=lfs diff=lfs merge=lfs -text
113
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-82_16.png filter=lfs diff=lfs merge=lfs -text
114
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-83_16.png filter=lfs diff=lfs merge=lfs -text
115
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-84_16.png filter=lfs diff=lfs merge=lfs -text
116
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-85_16.png filter=lfs diff=lfs merge=lfs -text
117
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-86_16.png filter=lfs diff=lfs merge=lfs -text
118
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-87_16.png filter=lfs diff=lfs merge=lfs -text
119
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-88_16.png filter=lfs diff=lfs merge=lfs -text
120
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-89_16.png filter=lfs diff=lfs merge=lfs -text
121
+ Vaani/VaaniLDM/vqvae_recon/reconstructed_images_EP-9_16.png filter=lfs diff=lfs merge=lfs -text
122
+ Vaani/VaaniLDM_Acc/vqvae_recon/reconstructed_images_EP-0_16.png filter=lfs diff=lfs merge=lfs -text
123
+ Vaani/VaaniLDM_Acc/vqvae_recon/reconstructed_images_EP-1_16.png filter=lfs diff=lfs merge=lfs -text
124
+ Vaani/VaaniLDM_Acc/vqvae_recon/reconstructed_images_EP-2_16.png filter=lfs diff=lfs merge=lfs -text
125
+ Vaani/VaaniLDM_Acc/vqvae_recon/reconstructed_images_EP-3_16.png filter=lfs diff=lfs merge=lfs -text
126
+ Vaani/VaaniLDM_Acc/vqvae_recon/reconstructed_images_EP-4_16.png filter=lfs diff=lfs merge=lfs -text
127
+ Vaani/VaaniLDM_Acc/vqvae_recon/reconstructed_images_EP-5_16.png filter=lfs diff=lfs merge=lfs -text
128
+ Vaani/_1_data.ipynb filter=lfs diff=lfs merge=lfs -text
129
+ Vaani/audio_urls.txt filter=lfs diff=lfs merge=lfs -text
130
+ Vaani/finalMETA.csv filter=lfs diff=lfs merge=lfs -text
131
+ Vaani/image_metadata_summary.csv filter=lfs diff=lfs merge=lfs -text
132
+ Vaani/images_urls.txt filter=lfs diff=lfs merge=lfs -text
133
+ Vaani/output_image.png filter=lfs diff=lfs merge=lfs -text
134
+ Vaani/output_image2.png filter=lfs diff=lfs merge=lfs -text
135
+ Vaani/sampleJSON.csv filter=lfs diff=lfs merge=lfs -text
136
+ Vaani/sampleJSON.json filter=lfs diff=lfs merge=lfs -text
137
+ tools/__pycache__/pynvml.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,25 @@
1
+ # # Ignore image files
2
+ # *.jpg
3
+ # *.jpeg
4
+ # *.png
5
+ # *.gif
6
+ # *.bmp
7
+ # *.tiff
8
+ # *.webp
9
+ # *.svg
10
+
11
+ # # Ignore specified data files
12
+ # *.pth
13
+ # *.pt
14
+ # *.safetensors
15
+ # *.npz
16
+ # *.npy
17
+ # *.csv
18
+ # *.parquet
19
+ # *.json
20
+ # *.err
21
+ # *.out
22
+
23
+
24
+ # Vaani/audio_urls.txt
25
+ # Vaani/images_urls.txt
.vscode/settings.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "auto-scroll.enabled": false
3
+ }
DDPM/CeleabA.parquet ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f41418ec864a1ceee3e4f3c4863f758b534cf434f848c64a4d1df976d10f241
3
+ size 3396938
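The three lines above are a Git LFS pointer: the parquet data itself lives in LFS storage, and the repository only records the spec version, content `oid`, and `size`. A minimal sketch of reading such a pointer in Python (the helper name and return shape are illustrative, not part of this repo):

```python
# Illustrative only: parse the three-line Git LFS pointer shown above
# ("version ...", "oid sha256:...", "size ...") into a dict.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

# e.g. parse_lfs_pointer("DDPM/CeleabA.parquet")
# -> {"version": "https://git-lfs.github.com/spec/v1",
#     "oid": "sha256:0f41...", "size": "3396938"}
```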
DDPM/_1_Mnist.ipynb ADDED
@@ -0,0 +1,546 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import torch\n",
10
+ "import torch.nn as nn\n",
11
+ "import torch.optim as optim\n",
12
+ "import torch.utils.checkpoint as checkpoint\n",
13
+ "from torchvision import datasets, transforms\n",
14
+ "from torch.utils.data import DataLoader"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "import time\n",
24
+ "import nvidia_smi\n",
25
+ "import prettytable as pt\n",
26
+ "\n",
27
+ "def gputil_decorator(func):\n",
28
+ " def wrapper(*args, **kwargs):\n",
29
+ " import nvidia_smi\n",
30
+ " import prettytable as pt\n",
31
+ "\n",
32
+ " try:\n",
33
+ " table = pt.PrettyTable(['Devices','Mem Free','GPU-util','GPU-mem'])\n",
34
+ " nvidia_smi.nvmlInit()\n",
35
+ " deviceCount = nvidia_smi.nvmlDeviceGetCount()\n",
36
+ " for i in range(deviceCount):\n",
37
+ " handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)\n",
38
+ " res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)\n",
39
+ " mem = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)\n",
40
+ " table.add_row([i, f\"{mem.free/1024**2:5.2f}MB/{mem.total/1024**2:5.2f}MB\", f\"{res.gpu:3.1%}\", f\"{res.memory:3.1%}\"])\n",
41
+ "\n",
42
+ " except nvidia_smi.NVMLError as error:\n",
43
+ " print(error)\n",
44
+ "\n",
45
+ " print(table)\n",
46
+ " return func(*args, **kwargs)\n",
47
+ " return wrapper\n",
48
+ "\n",
49
+ "def gputil_decorator2(func):\n",
50
+ " def wrapper(*args, **kwargs):\n",
51
+ " try:\n",
52
+ " table = pt.PrettyTable(['Devices', 'Mem Free', 'GPU-util', 'GPU-mem'])\n",
53
+ " nvidia_smi.nvmlInit()\n",
54
+ " device_count = nvidia_smi.nvmlDeviceGetCount()\n",
55
+ " for i in range(device_count):\n",
56
+ " handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)\n",
57
+ " res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)\n",
58
+ " mem = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)\n",
59
+ " table.add_row([\n",
60
+ " i,\n",
61
+ " f\"{mem.free / 1024 ** 2:5.2f}MB/{mem.total / 1024 ** 2:5.2f}MB\",\n",
62
+ " f\"{res.gpu:3.1%}\",\n",
63
+ " f\"{res.memory:3.1%}\"\n",
64
+ " ])\n",
65
+ " nvidia_smi.nvmlShutdown()\n",
66
+ " except nvidia_smi.NVMLError as error:\n",
67
+ " print(f\"Error fetching GPU stats: {error}\")\n",
68
+ " print(table)\n",
69
+ " return func(*args, **kwargs)\n",
70
+ " return wrapper"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": null,
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "import torch\n",
80
+ "import torch.nn as nn\n",
81
+ "import torch.optim as optim\n",
82
+ "from torchvision import datasets, transforms\n",
83
+ "from torch.utils.data import DataLoader\n",
84
+ "import torchvision.models as models\n",
85
+ "import threading\n",
86
+ "import time\n",
87
+ "import nvidia_smi\n",
88
+ "import prettytable as pt\n",
89
+ "import os\n",
90
+ "\n",
91
+ "# GPU stats decorator\n",
92
+ "def gputil_decorator2(func):\n",
93
+ " def wrapper(*args, **kwargs):\n",
94
+ " try:\n",
95
+ " table = pt.PrettyTable(['Devices', 'Mem Free', 'GPU-util', 'GPU-mem'])\n",
96
+ " nvidia_smi.nvmlInit()\n",
97
+ " device_count = nvidia_smi.nvmlDeviceGetCount()\n",
98
+ " for i in range(device_count):\n",
99
+ " handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)\n",
100
+ " res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)\n",
101
+ " mem = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)\n",
102
+ " table.add_row([\n",
103
+ " i,\n",
104
+ " f\"{mem.free / 1024 ** 2:5.2f}MB/{mem.total / 1024 ** 2:5.2f}MB\",\n",
105
+ " f\"{res.gpu:3.1%}\",\n",
106
+ " f\"{res.memory:3.1%}\"\n",
107
+ " ])\n",
108
+ " nvidia_smi.nvmlShutdown()\n",
109
+ " except nvidia_smi.NVMLError as error:\n",
110
+ " print(f\"Error fetching GPU stats: {error}\")\n",
111
+ " print(table)\n",
112
+ " return func(*args, **kwargs)\n",
113
+ " return wrapper\n",
114
+ "\n",
115
+ "# Function to print GPU stats every second\n",
116
+ "def print_gpu_stats(epoch_info):\n",
117
+ " while not stop_event.is_set():\n",
118
+ " os.system('cls' if os.name == 'nt' else 'clear') # Clear the terminal\n",
119
+ " gputil_decorator2(lambda: None)() # Call the decorator to print stats\n",
120
+ " print(epoch_info) # Print epoch information\n",
121
+ " time.sleep(1) # Wait for 1 second\n",
122
+ "\n",
123
+ "# Define the model\n",
124
+ "class EfficientNetCIFAR10(nn.Module):\n",
125
+ " def __init__(self, num_classes=10):\n",
126
+ " super(EfficientNetCIFAR10, self).__init__()\n",
127
+ " self.efficientnet = models.efficientnet_v2_l(weights=models.EfficientNet_V2_L_Weights.IMAGENET1K_V1)\n",
128
+ " self.efficientnet.classifier[1] = nn.Linear(self.efficientnet.classifier[1].in_features, num_classes)\n",
129
+ "\n",
130
+ " def forward(self, x):\n",
131
+ " return self.efficientnet(x)\n",
132
+ "\n",
133
+ "# Load CIFAR-10 dataset\n",
134
+ "transform = transforms.Compose([\n",
135
+ " transforms.ToTensor(),\n",
136
+ " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n",
137
+ "])\n",
138
+ "\n",
139
+ "train_dataset = datasets.CIFAR10(root='/home/23m1521/datasets', train=True, download=True, transform=transform)\n",
140
+ "train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=64)\n",
141
+ "\n",
142
+ "test_dataset = datasets.CIFAR10(root='/home/23m1521/datasets', train=False, download=True, transform=transform)\n",
143
+ "test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=64)\n",
144
+ "\n",
145
+ "# Initialize model, loss function, and optimizer\n",
146
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
147
+ "model = EfficientNetCIFAR10(num_classes=10).to(device)\n",
148
+ "criterion = nn.CrossEntropyLoss()\n",
149
+ "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
150
+ "\n",
151
+ "# Training loop\n",
152
+ "def train(model, train_loader, criterion, optimizer, device):\n",
153
+ " model.train()\n",
154
+ " running_loss = 0.0\n",
155
+ " for inputs, labels in train_loader:\n",
156
+ " inputs, labels = inputs.to(device), labels.to(device)\n",
157
+ "\n",
158
+ " optimizer.zero_grad()\n",
159
+ " outputs = model(inputs)\n",
160
+ " loss = criterion(outputs, labels)\n",
161
+ " loss.backward()\n",
162
+ " optimizer.step()\n",
163
+ "\n",
164
+ " running_loss += loss.item()\n",
165
+ " return running_loss / len(train_loader)\n",
166
+ "\n",
167
+ "# Testing loop\n",
168
+ "def test(model, test_loader, criterion, device):\n",
169
+ " model.eval()\n",
170
+ " correct = 0\n",
171
+ " total = 0\n",
172
+ " with torch.no_grad():\n",
173
+ " for inputs, labels in test_loader:\n",
174
+ " inputs, labels = inputs.to(device), labels.to(device)\n",
175
+ " outputs = model(inputs)\n",
176
+ " _, predicted = torch.max(outputs.data, 1)\n",
177
+ " total += labels.size(0)\n",
178
+ " correct += (predicted == labels).sum().item()\n",
179
+ " return correct / total\n",
180
+ "\n",
181
+ "# Start the GPU stats printing thread\n",
182
+ "stop_event = threading.Event()\n",
183
+ "epoch_info = \"\" # Placeholder for epoch information\n",
184
+ "gpu_stats_thread = threading.Thread(target=print_gpu_stats, args=(epoch_info,))\n",
185
+ "gpu_stats_thread.start()\n",
186
+ "\n",
187
+ "# Train and test the model\n",
188
+ "num_epochs = 5\n",
189
+ "for epoch in range(num_epochs):\n",
190
+ " train_loss = train(model, train_loader, criterion, optimizer, device)\n",
191
+ " test_acc = test(model, test_loader, criterion, device)\n",
192
+ " epoch_info = f\"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Test Accuracy: {test_acc:.4f}\"\n",
193
+ " print_gpu_stats(epoch_info) # Print epoch information\n",
194
+ "\n",
195
+ "# Stop the GPU stats printing thread\n",
196
+ "stop_event.set()\n",
197
+ "gpu_stats_thread.join()"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 2,
203
+ "metadata": {},
204
+ "outputs": [
205
+ {
206
+ "name": "stdout",
207
+ "output_type": "stream",
208
+ "text": [
209
+ "Files already downloaded and verified\n",
210
+ "Files already downloaded and verified\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "# Define a simple CNN model\n",
216
+ "class SimpleCNN(nn.Module):\n",
217
+ " def __init__(self):\n",
218
+ " super(SimpleCNN, self).__init__()\n",
219
+ " self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)\n",
220
+ " self.relu1 = nn.ReLU()\n",
221
+ " self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)\n",
222
+ " self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)\n",
223
+ " self.relu2 = nn.ReLU()\n",
224
+ " self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)\n",
225
+ " self.fc1 = nn.Linear(32 * 8 * 8, 256)\n",
226
+ " self.relu3 = nn.ReLU()\n",
227
+ " self.fc2 = nn.Linear(256, 10) # CIFAR-10 has 10 classes\n",
228
+ "\n",
229
+ " def forward(self, x):\n",
230
+ " # Apply gradient/activation checkpointing to the second convolutional block\n",
231
+ " x = self.conv1(x)\n",
232
+ " x = self.relu1(x)\n",
233
+ " x = self.pool1(x)\n",
234
+ " x = checkpoint.checkpoint(self._conv2_block, x) # Checkpointing here\n",
235
+ " x = x.view(x.size(0), -1) # Flatten\n",
236
+ " x = self.fc1(x)\n",
237
+ " x = self.relu3(x)\n",
238
+ " x = self.fc2(x)\n",
239
+ " return x\n",
240
+ "\n",
241
+ " def _conv2_block(self, x):\n",
242
+ " # Helper function for the second convolutional block\n",
243
+ " x = self.conv2(x)\n",
244
+ " x = self.relu2(x)\n",
245
+ " x = self.pool2(x)\n",
246
+ " return x\n",
247
+ "\n",
248
+ "# Load CIFAR-10 dataset\n",
249
+ "transform = transforms.Compose([\n",
250
+ " transforms.ToTensor(),\n",
251
+ " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n",
252
+ "])\n",
253
+ "\n",
254
+ "train_dataset = datasets.CIFAR10(root='/home/23m1521/datasets', train=True, download=True, transform=transform)\n",
255
+ "train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
256
+ "\n",
257
+ "test_dataset = datasets.CIFAR10(root='/home/23m1521/datasets', train=False, download=True, transform=transform)\n",
258
+ "test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)\n",
259
+ "\n",
260
+ "# Initialize model, loss function, and optimizer\n",
261
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
262
+ "model = SimpleCNN().to(device)\n",
263
+ "criterion = nn.CrossEntropyLoss()\n",
264
+ "optimizer = optim.Adam(model.parameters(), lr=0.001)"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": null,
270
+ "metadata": {},
271
+ "outputs": [
272
+ {
273
+ "name": "stdout",
274
+ "output_type": "stream",
275
+ "text": [
276
+ "+---------+-----------------------+----------+---------+\n",
277
+ "| Devices | Mem Free | GPU-util | GPU-mem |\n",
278
+ "+---------+-----------------------+----------+---------+\n",
279
+ "| 0 | 23416.75MB/24564.00MB | 0.0% | 0.0% |\n",
280
+ "| 1 | 944.75MB/24564.00MB | 0.0% | 0.0% |\n",
281
+ "+---------+-----------------------+----------+---------+\n"
282
+ ]
283
+ },
284
+ {
285
+ "name": "stderr",
286
+ "output_type": "stream",
287
+ "text": [
288
+ "/home/23m1521/.conda/envs/cuda_env2/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:600: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
289
+ " return fn(*args, **kwargs)\n",
290
+ "/home/23m1521/.conda/envs/cuda_env2/lib/python3.12/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
291
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
292
+ "/home/23m1521/.conda/envs/cuda_env2/lib/python3.12/site-packages/torch/utils/checkpoint.py:92: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
293
+ " warnings.warn(\n"
294
+ ]
295
+ },
296
+ {
297
+ "name": "stdout",
298
+ "output_type": "stream",
299
+ "text": [
300
+ "Epoch [1/5], Loss: 1.3807, Test Accuracy: 0.5572\n",
301
+ "+---------+-----------------------+----------+---------+\n",
302
+ "| Devices | Mem Free | GPU-util | GPU-mem |\n",
303
+ "+---------+-----------------------+----------+---------+\n",
304
+ "| 0 | 22732.75MB/24564.00MB | 300.0% | 100.0% |\n",
305
+ "| 1 | 944.75MB/24564.00MB | 0.0% | 0.0% |\n",
306
+ "+---------+-----------------------+----------+---------+\n",
307
+ "Epoch [2/5], Loss: 1.0334, Test Accuracy: 0.6553\n",
308
+ "+---------+-----------------------+----------+---------+\n",
309
+ "| Devices | Mem Free | GPU-util | GPU-mem |\n",
310
+ "+---------+-----------------------+----------+---------+\n",
311
+ "| 0 | 22732.75MB/24564.00MB | 300.0% | 100.0% |\n",
312
+ "| 1 | 944.75MB/24564.00MB | 0.0% | 0.0% |\n",
313
+ "+---------+-----------------------+----------+---------+\n",
314
+ "Epoch [3/5], Loss: 0.8787, Test Accuracy: 0.6824\n",
315
+ "+---------+-----------------------+----------+---------+\n",
316
+ "| Devices | Mem Free | GPU-util | GPU-mem |\n",
317
+ "+---------+-----------------------+----------+---------+\n",
318
+ "| 0 | 22732.75MB/24564.00MB | 200.0% | 100.0% |\n",
319
+ "| 1 | 944.75MB/24564.00MB | 0.0% | 0.0% |\n",
320
+ "+---------+-----------------------+----------+---------+\n",
321
+ "Epoch [4/5], Loss: 0.7545, Test Accuracy: 0.6885\n",
322
+ "+---------+-----------------------+----------+---------+\n",
323
+ "| Devices | Mem Free | GPU-util | GPU-mem |\n",
324
+ "+---------+-----------------------+----------+---------+\n",
325
+ "| 0 | 22732.75MB/24564.00MB | 300.0% | 100.0% |\n",
326
+ "| 1 | 944.75MB/24564.00MB | 0.0% | 0.0% |\n",
327
+ "+---------+-----------------------+----------+---------+\n",
328
+ "Epoch [5/5], Loss: 0.6537, Test Accuracy: 0.6989\n"
329
+ ]
330
+ }
331
+ ],
332
+ "source": [
333
+ "# Training loop\n",
334
+ "@gputil_decorator2\n",
335
+ "def train(model, train_loader, criterion, optimizer, device):\n",
336
+ " model.train()\n",
337
+ " running_loss = 0.0\n",
338
+ " for inputs, labels in train_loader:\n",
339
+ " inputs, labels = inputs.to(device), labels.to(device)\n",
340
+ "\n",
341
+ " optimizer.zero_grad()\n",
342
+ " outputs = model(inputs)\n",
343
+ " loss = criterion(outputs, labels)\n",
344
+ " loss.backward()\n",
345
+ " optimizer.step()\n",
346
+ "\n",
347
+ " running_loss += loss.item()\n",
348
+ " return running_loss / len(train_loader)\n",
349
+ "\n",
350
+ "# Testing loop\n",
351
+ "def test(model, test_loader, criterion, device):\n",
352
+ " model.eval()\n",
353
+ " correct = 0\n",
354
+ " total = 0\n",
355
+ " with torch.no_grad():\n",
356
+ " for inputs, labels in test_loader:\n",
357
+ " inputs, labels = inputs.to(device), labels.to(device)\n",
358
+ " outputs = model(inputs)\n",
359
+ " _, predicted = torch.max(outputs.data, 1)\n",
360
+ " total += labels.size(0)\n",
361
+ " correct += (predicted == labels).sum().item()\n",
362
+ " return correct / total\n",
363
+ "\n",
364
+ "# Train and test the model\n",
365
+ "num_epochs = 5\n",
366
+ "for epoch in range(num_epochs):\n",
367
+ " train_loss = train(model, train_loader, criterion, optimizer, device)\n",
368
+ " test_acc = test(model, test_loader, criterion, device)\n",
369
+ " print(f\"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Test Accuracy: {test_acc:.4f}\")"
370
+ ]
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "execution_count": 16,
375
+ "metadata": {},
376
+ "outputs": [
377
+ {
378
+ "data": {
379
+ "text/plain": [
380
+ "([0.023805618286132812, 0.0],\n",
381
+ " [0.04064750671386719, 0.0],\n",
382
+ " [23.679443359375, 23.679443359375])"
383
+ ]
384
+ },
385
+ "execution_count": 16,
386
+ "metadata": {},
387
+ "output_type": "execute_result"
388
+ }
389
+ ],
390
+ "source": [
391
+ "def get_gpu_memory_usage():\n",
392
+ " allocated_memory = []\n",
393
+ " free_memory = []\n",
394
+ " total_memory = []\n",
395
+ " if torch.cuda.is_available():\n",
396
+ " for i in range(torch.cuda.device_count()):\n",
397
+ " device = torch.device(f\"cuda:{i}\")\n",
398
+ " total = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3) # GB\n",
399
+ " allocated = torch.cuda.memory_allocated(device) / (1024 ** 3) # GB\n",
400
+ " reserved = torch.cuda.memory_reserved(device) / (1024 ** 3) # GB\n",
401
+ " free = reserved - allocated\n",
402
+ " total_memory.append(total)\n",
403
+ " allocated_memory.append(allocated)\n",
404
+ " free_memory.append(free)\n",
405
+ " return allocated_memory, free_memory, total_memory\n",
406
+ "get_gpu_memory_usage()"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": 1,
412
+ "metadata": {},
413
+ "outputs": [
414
+ {
415
+ "name": "stdout",
416
+ "output_type": "stream",
417
+ "text": [
418
+ "Files already downloaded and verified\n",
419
+ "Files already downloaded and verified\n"
420
+ ]
421
+ },
422
+ {
423
+ "name": "stderr",
424
+ "output_type": "stream",
425
+ "text": [
426
+ "Downloading: \"https://download.pytorch.org/models/efficientnet_v2_l-59c71312.pth\" to /home/23m1521/.cache/torch/hub/checkpoints/efficientnet_v2_l-59c71312.pth\n",
427
+ "100%|██████████| 455M/455M [00:04<00:00, 117MB/s] \n"
428
+ ]
429
+ },
430
+ {
431
+ "name": "stdout",
432
+ "output_type": "stream",
433
+ "text": [
434
+ "Epoch [1/5], Loss: 1.0192, Test Accuracy: 0.8080\n",
435
+ "Epoch [2/5], Loss: 0.4376, Test Accuracy: 0.8487\n",
436
+ "Epoch [3/5], Loss: 0.2590, Test Accuracy: 0.8334\n",
437
+ "Epoch [4/5], Loss: 0.1696, Test Accuracy: 0.8626\n",
438
+ "Epoch [5/5], Loss: 0.1257, Test Accuracy: 0.8621\n"
439
+ ]
440
+ }
441
+ ],
442
+ "source": [
443
+ "import torch\n",
444
+ "import torch.nn as nn\n",
445
+ "import torch.optim as optim\n",
446
+ "import torch.utils.checkpoint as checkpoint\n",
447
+ "from torchvision import datasets, transforms\n",
448
+ "from torch.utils.data import DataLoader\n",
449
+ "\n",
450
+ "import torch\n",
451
+ "import torch.nn as nn\n",
452
+ "import torchvision.models as models\n",
453
+ "\n",
454
+ "class EfficientNetCIFAR10(nn.Module):\n",
455
+ " def __init__(self, num_classes=10):\n",
456
+ " super(EfficientNetCIFAR10, self).__init__()\n",
457
+ " \n",
458
+ " # Load a pre-trained EfficientNet model\n",
459
+ " self.efficientnet = models.efficientnet_v2_l(weights=models.EfficientNet_V2_L_Weights.IMAGENET1K_V1)\n",
460
+ " \n",
461
+ " # Modify the classifier head for CIFAR-10 (10 classes)\n",
462
+ " self.efficientnet.classifier[1] = nn.Linear(self.efficientnet.classifier[1].in_features, num_classes)\n",
463
+ "\n",
464
+ " def forward(self, x):\n",
465
+ " return self.efficientnet(x)\n",
466
+ "\n",
467
+ "\n",
468
+ "# Load CIFAR-10 dataset\n",
469
+ "transform = transforms.Compose([\n",
470
+ " transforms.ToTensor(),\n",
471
+ " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n",
472
+ "])\n",
473
+ "\n",
474
+ "train_dataset = datasets.CIFAR10(root='/home/23m1521/datasets', train=True, download=True, transform=transform)\n",
475
+ "train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=64)\n",
476
+ "\n",
477
+ "test_dataset = datasets.CIFAR10(root='/home/23m1521/datasets', train=False, download=True, transform=transform)\n",
478
+ "test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=64)\n",
479
+ "\n",
480
+ "# Initialize model, loss function, and optimizer\n",
481
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
482
+ "model = EfficientNetCIFAR10(num_classes=10).to(device)\n",
483
+ "criterion = nn.CrossEntropyLoss()\n",
484
+ "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
485
+ "\n",
486
+ "# Training loop\n",
487
+ "def train(model, train_loader, criterion, optimizer, device):\n",
488
+ " model.train()\n",
489
+ " running_loss = 0.0\n",
490
+ " for inputs, labels in train_loader:\n",
491
+ " inputs, labels = inputs.to(device), labels.to(device)\n",
492
+ "\n",
493
+ " optimizer.zero_grad()\n",
494
+ " outputs = model(inputs)\n",
495
+ " loss = criterion(outputs, labels)\n",
496
+ " loss.backward()\n",
497
+ " optimizer.step()\n",
498
+ "\n",
499
+ " running_loss += loss.item()\n",
500
+ " return running_loss / len(train_loader)\n",
501
+ "\n",
502
+ "# Testing loop\n",
503
+ "def test(model, test_loader, criterion, device):\n",
504
+ " model.eval()\n",
505
+ " correct = 0\n",
506
+ " total = 0\n",
507
+ " with torch.no_grad():\n",
508
+ " for inputs, labels in test_loader:\n",
509
+ " inputs, labels = inputs.to(device), labels.to(device)\n",
510
+ " outputs = model(inputs)\n",
511
+ " _, predicted = torch.max(outputs.data, 1)\n",
512
+ " total += labels.size(0)\n",
513
+ " correct += (predicted == labels).sum().item()\n",
514
+ " return correct / total\n",
515
+ "\n",
516
+ "# Train and test the model\n",
517
+ "num_epochs = 5\n",
518
+ "for epoch in range(num_epochs):\n",
519
+ " train_loss = train(model, train_loader, criterion, optimizer, device)\n",
520
+ " test_acc = test(model, test_loader, criterion, device)\n",
521
+ " print(f\"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Test Accuracy: {test_acc:.4f}\")"
522
+ ]
523
+ }
524
+ ],
525
+ "metadata": {
526
+ "kernelspec": {
527
+ "display_name": "cuda_env2",
528
+ "language": "python",
529
+ "name": "python3"
530
+ },
531
+ "language_info": {
532
+ "codemirror_mode": {
533
+ "name": "ipython",
534
+ "version": 3
535
+ },
536
+ "file_extension": ".py",
537
+ "mimetype": "text/x-python",
538
+ "name": "python",
539
+ "nbconvert_exporter": "python",
540
+ "pygments_lexer": "ipython3",
541
+ "version": "3.12.2"
542
+ }
543
+ },
544
+ "nbformat": 4,
545
+ "nbformat_minor": 2
546
+ }
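A note on the EfficientNet training cell above: the GPU-stats thread is started with `args=(epoch_info,)`, so it captures the initial empty string and never sees the per-epoch reassignments, and calling `print_gpu_stats(epoch_info)` directly inside the epoch loop will spin until `stop_event` is set. Below is a minimal sketch of one way to share per-epoch status with a monitor thread; this is an illustrative variant, not the committed code, and it reports memory via `torch.cuda` rather than the `nvidia_smi` table:

```python
import threading
import time
import torch

status = {"epoch_info": ""}      # mutable holder shared with the monitor thread
stop_event = threading.Event()

def monitor(shared, interval=1.0):
    # Poll until the main thread signals completion.
    while not stop_event.is_set():
        if torch.cuda.is_available():
            used_mb = torch.cuda.memory_allocated() / 1024 ** 2
            print(f"GPU mem allocated: {used_mb:8.2f} MB | {shared['epoch_info']}")
        time.sleep(interval)

monitor_thread = threading.Thread(target=monitor, args=(status,), daemon=True)
monitor_thread.start()

num_epochs = 5
for epoch in range(num_epochs):
    # ... run train(...) and test(...) for one epoch here ...
    status["epoch_info"] = f"Epoch [{epoch + 1}/{num_epochs}] finished"

stop_event.set()
monitor_thread.join()
```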
DDPM/_3_Activation-Checkpointing-Sequential.ipynb ADDED
@@ -0,0 +1,216 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "id": "CqFGp-OjP0_G"
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "import torch\n",
12
+ "from torch.autograd import Variable"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "markdown",
17
+ "metadata": {
18
+ "id": "to7suvjJQJAM"
19
+ },
20
+ "source": [
21
+ "# [1] Checkpointing sequential models"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 5,
27
+ "metadata": {
28
+ "colab": {
29
+ "base_uri": "https://localhost:8080/"
30
+ },
31
+ "id": "1YmlCf4MQEXV",
32
+ "outputId": "03833d29-11aa-4def-a9e4-650e349201a3"
33
+ },
34
+ "outputs": [
35
+ {
36
+ "data": {
37
+ "text/plain": [
38
+ "[Linear(in_features=100, out_features=50, bias=True),\n",
39
+ " ReLU(),\n",
40
+ " Linear(in_features=50, out_features=20, bias=True),\n",
41
+ " ReLU(),\n",
42
+ " Linear(in_features=20, out_features=5, bias=True),\n",
43
+ " ReLU()]"
44
+ ]
45
+ },
46
+ "execution_count": 5,
47
+ "metadata": {},
48
+ "output_type": "execute_result"
49
+ }
50
+ ],
51
+ "source": [
52
+ "from torch.utils.checkpoint import checkpoint_sequential\n",
53
+ "import torch.nn as nn\n",
54
+ "\n",
55
+ "model = nn.Sequential(\n",
56
+ " nn.Linear(100, 50),\n",
57
+ " nn.ReLU(),\n",
58
+ " nn.Linear(50, 20),\n",
59
+ " nn.ReLU(),\n",
60
+ " nn.Linear(20, 5),\n",
61
+ " nn.ReLU()\n",
62
+ ")\n",
63
+ "\n",
64
+ "input_var = Variable(torch.randn(1, 100), requires_grad=True)\n",
65
+ "segments = 2\n",
66
+ "\n",
67
+ "modules = [module for k, module in model._modules.items()]\n",
68
+ "modules"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 7,
74
+ "metadata": {
75
+ "colab": {
76
+ "base_uri": "https://localhost:8080/"
77
+ },
78
+ "id": "aHSqU-keQaPe",
79
+ "outputId": "7ebc66fb-99ab-4d22-fa39-5710fb7ca2cd"
80
+ },
81
+ "outputs": [
82
+ {
83
+ "data": {
84
+ "text/plain": [
85
+ "tensor([[0.0000, 0.3800, 0.0000, 0.0000, 0.0000]], grad_fn=<ReluBackward0>)"
86
+ ]
87
+ },
88
+ "execution_count": 7,
89
+ "metadata": {},
90
+ "output_type": "execute_result"
91
+ }
92
+ ],
93
+ "source": [
94
+ "out = checkpoint_sequential(modules, segments, input_var, use_reentrant=False)\n",
95
+ "out"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 8,
101
+ "metadata": {
102
+ "id": "Q94h7De4RBGA"
103
+ },
104
+ "outputs": [],
105
+ "source": [
106
+ "# run the backwards pass on the model. For backwards pass, for simplicity purpose,\n",
107
+ "# we won't calculate the loss and rather backprop on out.sum()\n",
108
+ "model.zero_grad()\n",
109
+ "out.sum().backward()"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 9,
115
+ "metadata": {
116
+ "id": "LgNWA7fyRGAk"
117
+ },
118
+ "outputs": [],
119
+ "source": [
120
+ "# now we save the output and parameter gradients that we will use for comparison purposes with\n",
121
+ "# the non-checkpointed run.\n",
122
+ "output_checkpointed = out.data.clone()\n",
123
+ "grad_checkpointed = {}\n",
124
+ "for name, param in model.named_parameters():\n",
125
+ " grad_checkpointed[name] = param.grad.data.clone()"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "markdown",
130
+ "metadata": {
131
+ "id": "qkdJd-B3RRWh"
132
+ },
133
+ "source": [
134
+ "Now that we have executed the checkpointed pass on the model, let's also run the non-checkpointed model and verify that the checkpoint API doesn't change the model outputs or the parameter gradients."
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 10,
140
+ "metadata": {
141
+ "id": "Ts5GQzxkRVrU"
142
+ },
143
+ "outputs": [],
144
+ "source": [
145
+ "# non-checkpointed run of the model\n",
146
+ "original = model\n",
147
+ "\n",
148
+ "# create a new variable using the same tensor data\n",
149
+ "x = Variable(input_var.data, requires_grad=True)\n",
150
+ "\n",
151
+ "# get the model output and save it to prevent any modifications\n",
152
+ "out = original(x)\n",
153
+ "out_not_checkpointed = out.data.clone()\n",
154
+ "\n",
155
+ "# calculate the gradient now and save the parameter gradients values\n",
156
+ "original.zero_grad()\n",
157
+ "out.sum().backward()\n",
158
+ "grad_not_checkpointed = {}\n",
159
+ "for name, param in model.named_parameters():\n",
160
+ " grad_not_checkpointed[name] = param.grad.data.clone()"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "markdown",
165
+ "metadata": {
166
+ "id": "YiV1VBzyRX2Y"
167
+ },
168
+ "source": [
169
+ "Now that we have done the checkpointed and non-checkpointed pass of the model and saved the output and parameter gradients, let's compare their values"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": 13,
175
+ "metadata": {
176
+ "colab": {
177
+ "base_uri": "https://localhost:8080/"
178
+ },
179
+ "id": "v9Tj9o8VRYq2",
180
+ "outputId": "bd8a8100-d660-4858-eb48-4a85aca01c69"
181
+ },
182
+ "outputs": [
183
+ {
184
+ "name": "stdout",
185
+ "output_type": "stream",
186
+ "text": [
187
+ "Checkpointed and non-checkpointed results match!\n"
188
+ ]
189
+ }
190
+ ],
191
+ "source": [
192
+ "try:\n",
193
+ " assert torch.equal(output_checkpointed, out_not_checkpointed), \"Outputs do not match!\"\n",
194
+ " for name in grad_checkpointed:\n",
195
+ " assert torch.equal(grad_checkpointed[name], grad_not_checkpointed[name]), f\"Gradients for {name} do not match!\"\n",
196
+ " print(\"Checkpointed and non-checkpointed results match!\")\n",
197
+ "except AssertionError as e:\n",
198
+ " print(f\"Assertion failed: {e}\")"
199
+ ]
200
+ }
201
+ ],
202
+ "metadata": {
203
+ "colab": {
204
+ "provenance": []
205
+ },
206
+ "kernelspec": {
207
+ "display_name": "Python 3",
208
+ "name": "python3"
209
+ },
210
+ "language_info": {
211
+ "name": "python"
212
+ }
213
+ },
214
+ "nbformat": 4,
215
+ "nbformat_minor": 0
216
+ }
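For reference alongside the notebook above, a condensed sketch of the same checkpointed-versus-plain comparison. Assumptions: the same toy MLP; `checkpoint_sequential` is handed the `nn.Sequential` directly instead of an extracted module list, and plain tensors with `requires_grad=True` replace the deprecated `Variable` wrapper:

```python
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint_sequential

model = nn.Sequential(
    nn.Linear(100, 50), nn.ReLU(),
    nn.Linear(50, 20), nn.ReLU(),
    nn.Linear(20, 5), nn.ReLU(),
)
x = torch.randn(1, 100, requires_grad=True)

# Checkpointed run: activations inside each segment are recomputed on backward.
out_ckpt = checkpoint_sequential(model, 2, x, use_reentrant=False)
model.zero_grad()
out_ckpt.sum().backward()
grads_ckpt = {n: p.grad.clone() for n, p in model.named_parameters()}

# Plain run for comparison.
out_ref = model(x)
model.zero_grad()
out_ref.sum().backward()
grads_ref = {n: p.grad.clone() for n, p in model.named_parameters()}

assert torch.allclose(out_ckpt, out_ref)
assert all(torch.allclose(grads_ckpt[n], grads_ref[n]) for n in grads_ref)
print("Checkpointed and non-checkpointed results match!")
```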
DDPM/_4_Activation-Checkpointing-VAE.ipynb ADDED
@@ -0,0 +1,444 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 7,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "gpu_ram_utilization_bytes = torch.cuda.memory_allocated()\n",
10
+ "gpu_ram_utilization_mb = gpu_ram_utilization_bytes / (1024 * 1024)\n",
11
+ "gpu_ram_total_bytes = torch.cuda.get_device_properties(0).total_memory\n",
12
+ "gpu_ram_percentage = (gpu_ram_utilization_bytes / gpu_ram_total_bytes) * 100"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "gpu_ram_utilization_mb, gpu_ram_total_bytes"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {
28
+ "colab": {
29
+ "base_uri": "https://localhost:8080/"
30
+ },
31
+ "id": "ellNFnP7f2Wx",
32
+ "outputId": "3adb85e1-f41a-433f-bd77-f1301abb7731"
33
+ },
34
+ "outputs": [],
35
+ "source": [
36
+ "import os\n",
37
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
38
+ "\n",
39
+ "import psutil\n",
40
+ "import torch\n",
41
+ "from datetime import datetime\n",
42
+ "import time\n",
43
+ "import matplotlib.pyplot as plt\n",
44
+ "\n",
45
+ "\n",
46
+ "import torch\n",
47
+ "import torch.nn as nn\n",
48
+ "import torch.optim as optim\n",
49
+ "from torch.utils.data import DataLoader\n",
50
+ "from torchvision import datasets, transforms\n",
51
+ "import torch.nn.functional as F\n",
52
+ "\n",
53
+ "\n",
54
+ "\n",
55
+ "timestamps = []\n",
56
+ "cpu_ram_mb = []\n",
57
+ "cpu_ram_percent = []\n",
58
+ "gpu_ram_mb = []\n",
59
+ "gpu_ram_percent = []\n",
60
+ "\n",
61
+ "\n",
62
+ "\n",
63
+ "# --- System Utilization ---------------------------------------------------------------------------\n",
64
+ "def get_system_utilization():\n",
65
+ " current_time = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
66
+ "\n",
67
+ " cpu_ram = psutil.virtual_memory()\n",
68
+ " cpu_ram_utilization_bytes = cpu_ram.used\n",
69
+ " cpu_ram_utilization_mb = cpu_ram_utilization_bytes / (1024 * 1024)\n",
70
+ " cpu_ram_percentage = cpu_ram.percent\n",
71
+ "\n",
72
+ " gpu_ram_utilization_mb = None\n",
73
+ " gpu_ram_percentage = None\n",
74
+ " if torch.cuda.is_available():\n",
75
+ " gpu_ram_utilization_bytes = torch.cuda.memory_allocated()\n",
76
+ " gpu_ram_utilization_mb = gpu_ram_utilization_bytes / (1024 * 1024)\n",
77
+ " gpu_ram_total_bytes = torch.cuda.get_device_properties(0).total_memory\n",
78
+ " gpu_ram_percentage = (gpu_ram_utilization_bytes / gpu_ram_total_bytes) * 100\n",
79
+ "\n",
80
+ " return {\n",
81
+ " \"time\": current_time,\n",
82
+ " \"cpu_ram_utilization_mb\": cpu_ram_utilization_mb,\n",
83
+ " \"cpu_ram_percentage\": cpu_ram_percentage,\n",
84
+ " \"gpu_ram_utilization_mb\": gpu_ram_utilization_mb,\n",
85
+ " \"gpu_ram_percentage\": gpu_ram_percentage\n",
86
+ " }\n",
87
+ "\n",
88
+ "\n",
89
+ "\n",
90
+ "def update_utilization_lists():\n",
91
+ " global timestamps, cpu_ram_mb, cpu_ram_percent, gpu_ram_mb, gpu_ram_percent\n",
92
+ "\n",
93
+ " utilization = get_system_utilization()\n",
94
+ "\n",
95
+ " timestamps.append(utilization[\"time\"])\n",
96
+ " cpu_ram_mb.append(utilization[\"cpu_ram_utilization_mb\"])\n",
97
+ " cpu_ram_percent.append(utilization[\"cpu_ram_percentage\"])\n",
98
+ " gpu_ram_mb.append(utilization[\"gpu_ram_utilization_mb\"])\n",
99
+ " gpu_ram_percent.append(utilization[\"gpu_ram_percentage\"])\n",
100
+ "\n",
101
+ "\n",
102
+ "\n",
103
+ "# --- Define the VAE model -------------------------------------------------------------------------\n",
104
+ "class VAE(nn.Module):\n",
105
+ " update_utilization_lists()\n",
106
+ " def __init__(self, latent_dim=20):\n",
107
+ " super(VAE, self).__init__()\n",
108
+ " self.latent_dim = latent_dim\n",
109
+ "\n",
110
+ " # Encoder\n",
111
+ " update_utilization_lists()\n",
112
+ " self.encoder = nn.Sequential(\n",
113
+ " nn.Linear(28 * 28, 512),\n",
114
+ " nn.ReLU(),\n",
115
+ " nn.Linear(512, 256),\n",
116
+ " nn.ReLU(),\n",
117
+ " nn.Linear(256, 2 * latent_dim) # Output mean and log variance\n",
118
+ " )\n",
119
+ "\n",
120
+ " # Decoder\n",
121
+ " update_utilization_lists()\n",
122
+ " self.decoder = nn.Sequential(\n",
123
+ " nn.Linear(latent_dim, 256),\n",
124
+ " nn.ReLU(),\n",
125
+ " nn.Linear(256, 512),\n",
126
+ " nn.ReLU(),\n",
127
+ " nn.Linear(512, 28 * 28),\n",
128
+ " nn.Sigmoid()\n",
129
+ " )\n",
130
+ "\n",
131
+ " def encode(self, x):\n",
132
+ " update_utilization_lists()\n",
133
+ " h = self.encoder(x)\n",
134
+ "\n",
135
+ " update_utilization_lists()\n",
136
+ " mu, logvar = h.chunk(2, dim=-1) # Split into mean and log variance\n",
137
+ "\n",
138
+ " update_utilization_lists()\n",
139
+ " return mu, logvar\n",
140
+ "\n",
141
+ " def reparameterize(self, mu, logvar):\n",
142
+ " update_utilization_lists()\n",
143
+ " std = torch.exp(0.5 * logvar)\n",
144
+ "\n",
145
+ " update_utilization_lists()\n",
146
+ " eps = torch.randn_like(std)\n",
147
+ "\n",
148
+ " update_utilization_lists()\n",
149
+ " return mu + eps * std\n",
150
+ "\n",
151
+ " def decode(self, z):\n",
152
+ " update_utilization_lists()\n",
153
+ " decoded = self.decoder(z)\n",
154
+ "\n",
155
+ " update_utilization_lists()\n",
156
+ " return decoded\n",
157
+ "\n",
158
+ " def forward(self, x):\n",
159
+ " update_utilization_lists()\n",
160
+ " mu, logvar = self.encode(x.view(-1, 28 * 28))\n",
161
+ "\n",
162
+ " z = self.reparameterize(mu, logvar)\n",
163
+ " return self.decode(z), mu, logvar\n",
164
+ "\n",
165
+ "\n",
166
+ "\n",
167
+ "# --- Loss function --------------------------------------------------------------------------------\n",
168
+ "def loss_function(recon_x, x, mu, logvar):\n",
169
+ " update_utilization_lists()\n",
170
+ " BCE = F.binary_cross_entropy(recon_x, x.view(-1, 28 * 28), reduction='sum')\n",
171
+ " \n",
172
+ " update_utilization_lists()\n",
173
+ " KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())\n",
174
+ " \n",
175
+ " update_utilization_lists()\n",
176
+ " return BCE + KLD\n",
177
+ "\n",
178
+ "\n",
179
+ "\n",
180
+ "# --- Load MNIST dataset ---------------------------------------------------------------------------\n",
181
+ "transform = transforms.Compose([transforms.ToTensor()])\n",
182
+ "train_dataset = datasets.MNIST(root='/home/23m1521/datasets/MNIST', train=True, download=True, transform=transform)\n",
183
+ "train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=os.cpu_count())\n",
184
+ "\n",
185
+ "\n",
186
+ "\n",
187
+ "# --- Initialize model, optimizer ------------------------------------------------------------------\n",
188
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
189
+ "model = VAE(latent_dim=20).to(device)\n",
190
+ "optimizer = optim.Adam(model.parameters(), lr=1e-3)\n",
191
+ "\n",
192
+ "\n",
193
+ "\n",
194
+ "# --- Training loop --------------------------------------------------------------------------------\n",
195
+ "def train(epoch):\n",
196
+ " update_utilization_lists()\n",
197
+ " model.train()\n",
198
+ " \n",
199
+ " train_loss = 0\n",
200
+ " for batch_idx, (data, _) in enumerate(train_loader):\n",
201
+ " update_utilization_lists()\n",
202
+ " \n",
203
+ " data = data.to(device)\n",
204
+ " update_utilization_lists()\n",
205
+ " \n",
206
+ " optimizer.zero_grad()\n",
207
+ " update_utilization_lists()\n",
208
+ " \n",
209
+ " recon_batch, mu, logvar = model(data)\n",
210
+ " update_utilization_lists()\n",
211
+ " \n",
212
+ " loss = loss_function(recon_batch, data, mu, logvar)\n",
213
+ " update_utilization_lists()\n",
214
+ " \n",
215
+ " loss.backward()\n",
216
+ " update_utilization_lists()\n",
217
+ " \n",
218
+ " train_loss += loss.item()\n",
219
+ " update_utilization_lists()\n",
220
+ " \n",
221
+ " optimizer.step()\n",
222
+ " update_utilization_lists()\n",
223
+ "\n",
224
+ " if batch_idx % 100 == 0:\n",
225
+ " print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '\n",
226
+ " f'({100. * batch_idx / len(train_loader):.0f}%)]\\tLoss: {loss.item() / len(data):.6f}')\n",
227
+ "\n",
228
+ " print(f'====> Epoch: {epoch} Average loss: {train_loss / len(train_loader.dataset):.4f}')\n",
229
+ "\n",
230
+ "\n",
231
+ "\n",
232
+ "# --- Train for 10 epochs --------------------------------------------------------------------------\n",
233
+ "for epoch in range(1,3):\n",
234
+ " update_utilization_lists()\n",
235
+ " train(epoch)\n",
236
+ " update_utilization_lists()"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": null,
242
+ "metadata": {
243
+ "colab": {
244
+ "base_uri": "https://localhost:8080/"
245
+ },
246
+ "id": "6M9KOwxshmZF",
247
+ "outputId": "274be81e-b8a7-4100-f6d8-235d5a8ffb6d"
248
+ },
249
+ "outputs": [],
250
+ "source": [
251
+ "print(\"CPU RAM (MB):\", min(cpu_ram_mb), max(cpu_ram_mb))\n",
252
+ "print(\"CPU RAM (%):\", min(cpu_ram_percent), max(cpu_ram_percent))\n",
253
+ "if torch.cuda.is_available():\n",
254
+ " print(\"GPU RAM (MB):\", min(gpu_ram_mb), max(gpu_ram_mb))\n",
255
+ " print(\"GPU RAM (%):\", min(gpu_ram_percent), max(gpu_ram_percent))"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": null,
261
+ "metadata": {
262
+ "colab": {
263
+ "base_uri": "https://localhost:8080/",
264
+ "height": 400
265
+ },
266
+ "id": "mKdK390Ehq7u",
267
+ "outputId": "524a035c-98c5-4c45-99c8-96a882007427"
268
+ },
269
+ "outputs": [],
270
+ "source": [
271
+ "plt.figure(figsize=(21, 8))\n",
272
+ "\n",
273
+ "# --- Plot CPU RAM Utilization (MB) ----------------------------------------------------------------\n",
274
+ "plt.subplot(2, 2, 1)\n",
275
+ "plt.plot(range(len(timestamps)), cpu_ram_mb, label=\"CPU RAM (MB)\")\n",
276
+ "plt.title(\"CPU RAM Utilization (MB)\")\n",
277
+ "plt.xlabel(\"Time\")\n",
278
+ "plt.ylabel(\"MB\")\n",
279
+ "plt.xticks(rotation=45)\n",
280
+ "plt.grid(True)\n",
281
+ "plt.legend()\n",
282
+ "\n",
283
+ "# --- Plot CPU RAM Utilization (%) -----------------------------------------------------------------\n",
284
+ "plt.subplot(2, 2, 2)\n",
285
+ "plt.plot(range(len(timestamps)), cpu_ram_percent, label=\"CPU RAM (%)\", color=\"orange\")\n",
286
+ "plt.title(\"CPU RAM Utilization (%)\")\n",
287
+ "plt.xlabel(\"Time\")\n",
288
+ "plt.ylabel(\"Percentage\")\n",
289
+ "plt.xticks(rotation=45)\n",
290
+ "plt.grid(True)\n",
291
+ "plt.legend()\n",
292
+ "\n",
293
+ "# --- Plot GPU RAM Utilization (MB) if GPU exists --------------------------------------------------\n",
294
+ "if torch.cuda.is_available():\n",
295
+ " plt.subplot(2, 2, 3)\n",
296
+ " plt.plot(range(len(timestamps)), gpu_ram_mb, label=\"GPU RAM (MB)\", color=\"green\")\n",
297
+ " plt.title(\"GPU RAM Utilization (MB)\")\n",
298
+ " plt.xlabel(\"Time\")\n",
299
+ " plt.ylabel(\"MB\")\n",
300
+ " plt.xticks(rotation=45)\n",
301
+ " plt.grid(True)\n",
302
+ " plt.legend()\n",
303
+ "\n",
304
+ "\n",
305
+ "# --- Plot GPU RAM Utilization (%) if GPU exists ---------------------------------------------------\n",
306
+ " plt.subplot(2, 2, 4)\n",
307
+ " plt.plot(range(len(timestamps)), gpu_ram_percent, label=\"GPU RAM (%)\", color=\"red\")\n",
308
+ " plt.title(\"GPU RAM Utilization (%)\")\n",
309
+ " plt.xlabel(\"Time\")\n",
310
+ " plt.ylabel(\"Percentage\")\n",
311
+ " plt.xticks(rotation=45)\n",
312
+ " plt.grid(True)\n",
313
+ " plt.legend()\n",
314
+ "\n",
315
+ "\n",
316
+ "plt.tight_layout()\n",
317
+ "plt.show()"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": null,
323
+ "metadata": {},
324
+ "outputs": [],
325
+ "source": [
326
+ "if torch.cuda.is_available():\n",
327
+ " fig.add_trace(\n",
328
+ " go.Scatter(x=list(range(len(timestamps))), y=gpu_ram_mb, mode='lines', name='GPU RAM (MB)', line=dict(color='green')),\n",
329
+ " row=2, col=1\n",
330
+ " )\n",
331
+ "fig.show() "
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": null,
337
+ "metadata": {},
338
+ "outputs": [],
339
+ "source": [
340
+ "import plotly.graph_objects as go\n",
341
+ "from plotly.subplots import make_subplots\n",
342
+ "import torch\n",
343
+ "\n",
344
+ "# Create subplots\n",
345
+ "fig = make_subplots(\n",
346
+ " rows=2, cols=2,\n",
347
+ " subplot_titles=(\"CPU RAM Utilization (MB)\", \"CPU RAM Utilization (%)\",\n",
348
+ " \"GPU RAM Utilization (MB)\", \"GPU RAM Utilization (%)\")\n",
349
+ ")\n",
350
+ "\n",
351
+ "# Plot CPU RAM Utilization (MB)\n",
352
+ "fig.add_trace(\n",
353
+ " go.Scatter(x=list(range(len(timestamps))), y=cpu_ram_mb, mode='lines', name='CPU RAM (MB)'),\n",
354
+ " row=1, col=1\n",
355
+ ")\n",
356
+ "\n",
357
+ "# Plot CPU RAM Utilization (%)\n",
358
+ "fig.add_trace(\n",
359
+ " go.Scatter(x=list(range(len(timestamps))), y=cpu_ram_percent, mode='lines', name='CPU RAM (%)', line=dict(color='orange')),\n",
360
+ " row=1, col=2\n",
361
+ ")\n",
362
+ "\n",
363
+ "# Plot GPU RAM Utilization (MB) if GPU exists\n",
364
+ "if torch.cuda.is_available():\n",
365
+ " fig.add_trace(\n",
366
+ " go.Scatter(x=list(range(len(timestamps))), y=gpu_ram_mb, mode='lines', name='GPU RAM (MB)', line=dict(color='green')),\n",
367
+ " row=2, col=1\n",
368
+ " )\n",
369
+ "\n",
370
+ " # Plot GPU RAM Utilization (%)\n",
371
+ " fig.add_trace(\n",
372
+ " go.Scatter(x=list(range(len(timestamps))), y=gpu_ram_percent, mode='lines', name='GPU RAM (%)', line=dict(color='red')),\n",
373
+ " row=2, col=2\n",
374
+ " )\n",
375
+ "\n",
376
+ "# Update layout\n",
377
+ "fig.update_layout(\n",
378
+ " height=800, width=1200,\n",
379
+ " title_text=\"System Resource Utilization\",\n",
380
+ " showlegend=True\n",
381
+ ")\n",
382
+ "\n",
383
+ "fig.update_xaxes(title_text=\"Time\", tickangle=45)\n",
384
+ "fig.update_yaxes(title_text=\"MB or Percentage\")\n",
385
+ "\n",
386
+ "# Show plot\n",
387
+ "fig.show()"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "metadata": {
394
+ "colab": {
395
+ "base_uri": "https://localhost:8080/",
396
+ "height": 454
397
+ },
398
+ "id": "3MGfGd_Ojcrf",
399
+ "outputId": "f1091984-2658-4053-ff08-c7c300c08d0e"
400
+ },
401
+ "outputs": [],
402
+ "source": [
403
+ "plt.figure(figsize=(21, 4))\n",
404
+ "\n",
405
+ "r = 12000 # range(len(timestamps))\n",
406
+ "x, y = range(r), cpu_ram_mb[:r]\n",
407
+ "\n",
408
+ "plt.plot(x, y, label=\"CPU RAM (MB)\")\n",
409
+ "plt.title(\"CPU RAM Utilization (MB)\")\n",
410
+ "plt.xlabel(\"Time\")\n",
411
+ "plt.ylabel(\"MB\")\n",
412
+ "plt.xticks(rotation=45)\n",
413
+ "plt.grid(True)\n",
414
+ "plt.legend()\n",
415
+ "plt.tight_layout()\n",
416
+ "plt.show()"
417
+ ]
418
+ }
419
+ ],
420
+ "metadata": {
421
+ "colab": {
422
+ "provenance": []
423
+ },
424
+ "kernelspec": {
425
+ "display_name": "cuda_env2",
426
+ "language": "python",
427
+ "name": "python3"
428
+ },
429
+ "language_info": {
430
+ "codemirror_mode": {
431
+ "name": "ipython",
432
+ "version": 3
433
+ },
434
+ "file_extension": ".py",
435
+ "mimetype": "text/x-python",
436
+ "name": "python",
437
+ "nbconvert_exporter": "python",
438
+ "pygments_lexer": "ipython3",
439
+ "version": "3.12.2"
440
+ }
441
+ },
442
+ "nbformat": 4,
443
+ "nbformat_minor": 0
444
+ }
DDPM/_5_Activation-Ckpt-VAE-CelebA.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Imgui/demo-newstyle.py ADDED
@@ -0,0 +1,298 @@
1
+ # -*- coding: utf-8 -*-
2
+ import os
3
+ import sys
4
+
5
+ # For Linux/Wayland users.
6
+ if os.getenv("XDG_SESSION_TYPE") == "wayland":
7
+ os.environ["XDG_SESSION_TYPE"] = "x11"
8
+
9
+ import glfw
10
+ import OpenGL.GL as gl
11
+ import imgui
12
+ from imgui.integrations.glfw import GlfwRenderer
13
+
14
+ active = {
15
+ "window": True,
16
+ "child": False,
17
+ "tooltip": False,
18
+ "menu bar": False,
19
+ "popup": False,
20
+ "popup modal": False,
21
+ "popup context item": False,
22
+ "popup context window": False,
23
+ "drag drop": False,
24
+ "group": False,
25
+ "tab bar": False,
26
+ "list box": False,
27
+ "popup context void": False,
28
+ "table": False,
29
+ }
30
+ path_to_font = None # "path/to/font.ttf"
31
+
32
+ opened_state = True
33
+
34
+ # Frame commands from the video
35
+ # def frame_commands():
36
+ # io = imgui.get_io()
37
+ # if io.key_ctrl and io.keys_down[glfw.KEY_Q]:
38
+ # sys.exit(0)
39
+ #
40
+ # if imgui.begin_main_menu_bar():
41
+ # if imgui.begin_menu("File"):
42
+ # clicked, selected = imgui.menu_item("Quit", "Ctrl+Q")
43
+ # if clicked:
44
+ # sys.exit(0)
45
+ # imgui.end_menu()
46
+ # imgui.end_main_menu_bar()
47
+ #
48
+ # with imgui.begin("A Window!"):
49
+ # if imgui.button("select"):
50
+ # imgui.open_popup("select-popup")
51
+ #
52
+ # try:
53
+ # with imgui.begin_popup("select-popup") as popup:
54
+ # if popup.opened:
55
+ # imgui.text("Select one")
56
+ # raise Exception
57
+ # except Exception:
58
+ # print("caught exception and no crash!")
59
+
60
+
61
+ def frame_commands():
62
+ io = imgui.get_io()
63
+
64
+ if io.key_ctrl and io.keys_down[glfw.KEY_Q]:
65
+ sys.exit(0)
66
+
67
+ with imgui.begin_main_menu_bar() as main_menu_bar:
68
+ if main_menu_bar.opened:
69
+ with imgui.begin_menu("File", True) as file_menu:
70
+ if file_menu.opened:
71
+ clicked_quit, selected_quit = imgui.menu_item("Quit", "Ctrl+Q")
72
+ if clicked_quit:
73
+ sys.exit(0)
74
+
75
+ # turn examples on/off
76
+ with imgui.begin("Active examples"):
77
+ for label, enabled in active.copy().items():
78
+ _, enabled = imgui.checkbox(label, enabled)
79
+ active[label] = enabled
80
+
81
+ if active["window"]:
82
+ with imgui.begin("Hello, Imgui!"):
83
+ imgui.text("Hello, World!")
84
+
85
+ if active["child"]:
86
+ with imgui.begin("Example: child region"):
87
+ with imgui.begin_child("region", 150, -50, border=True):
88
+ imgui.text("inside region")
89
+ imgui.text("outside region")
90
+
91
+ if active["tooltip"]:
92
+ with imgui.begin("Example: tooltip"):
93
+ imgui.button("Click me!")
94
+ if imgui.is_item_hovered():
95
+ with imgui.begin_tooltip():
96
+ imgui.text("This button is clickable.")
97
+
98
+ if active["menu bar"]:
99
+ try:
100
+ flags = imgui.WINDOW_MENU_BAR
101
+ with imgui.begin("Child Window - File Browser", flags=flags):
102
+ with imgui.begin_menu_bar() as menu_bar:
103
+ if menu_bar.opened:
104
+ with imgui.begin_menu('File') as file_menu:
105
+ if file_menu.opened:
106
+ clicked, state = imgui.menu_item('Close')
107
+ if clicked:
108
+ active["menu bar"] = False
109
+ raise Exception
110
+ except Exception:
111
+ print("exception handled")
112
+
113
+ if active["popup"]:
114
+ with imgui.begin("Example: simple popup"):
115
+ if imgui.button("select"):
116
+ imgui.open_popup("select-popup")
117
+ imgui.same_line()
118
+ with imgui.begin_popup("select-popup") as popup:
119
+ if popup.opened:
120
+ imgui.text("Select one")
121
+ imgui.separator()
122
+ imgui.selectable("One")
123
+ imgui.selectable("Two")
124
+ imgui.selectable("Three")
125
+
126
+ if active["popup modal"]:
127
+ with imgui.begin("Example: simple popup modal"):
128
+ if imgui.button("Open Modal popup"):
129
+ imgui.open_popup("select-popup-modal")
130
+ imgui.same_line()
131
+ with imgui.begin_popup_modal("select-popup-modal") as popup:
132
+ if popup.opened:
133
+ imgui.text("Select an option:")
134
+ imgui.separator()
135
+ imgui.selectable("One")
136
+ imgui.selectable("Two")
137
+ imgui.selectable("Three")
138
+
139
+ if active["popup context item"]:
140
+ with imgui.begin("Example: popup context view"):
141
+ imgui.text("Right-click to set value.")
142
+ with imgui.begin_popup_context_item("Item Context Menu") as popup:
143
+ if popup.opened:
144
+ imgui.selectable("Set to Zero")
145
+
146
+ if active["popup context window"]:
147
+ with imgui.begin("Example: popup context window"):
148
+ with imgui.begin_popup_context_window() as popup:
149
+ if popup.opened:
150
+ imgui.selectable("Clear")
151
+
152
+ if active["popup context void"]:
153
+ with imgui.begin_popup_context_void() as popup:
154
+ if popup.opened:
155
+ imgui.selectable("Clear")
156
+
157
+ if active["drag drop"]:
158
+ with imgui.begin("Example: drag and drop"):
159
+ imgui.button('source')
160
+ with imgui.begin_drag_drop_source() as src:
161
+ if src.dragging:
162
+ imgui.set_drag_drop_payload('itemtype', b'payload')
163
+ imgui.button('dragged source')
164
+ imgui.button('dest')
165
+ with imgui.begin_drag_drop_target() as dst:
166
+ if dst.hovered:
167
+ payload = imgui.accept_drag_drop_payload('itemtype')
168
+ if payload is not None:
169
+ print('Received:', payload)
170
+
171
+ if active["group"]:
172
+ with imgui.begin("Example: item groups"):
173
+ with imgui.begin_group():
174
+ imgui.text("First group (buttons):")
175
+ imgui.button("Button A")
176
+ imgui.button("Button B")
177
+ imgui.same_line(spacing=50)
178
+ with imgui.begin_group():
179
+ imgui.text("Second group (text and bullet texts):")
180
+ imgui.bullet_text("Bullet A")
181
+ imgui.bullet_text("Bullet B")
182
+
183
+ if active["tab bar"]:
184
+ with imgui.begin("Example Tab Bar"):
185
+ with imgui.begin_tab_bar("MyTabBar") as tab_bar:
186
+ if tab_bar.opened:
187
+ with imgui.begin_tab_item("Item 1") as item1:
188
+ if item1.opened:
189
+ imgui.text("Here is the tab content!")
190
+ with imgui.begin_tab_item("Item 2") as item2:
191
+ if item2.opened:
192
+ imgui.text("Another content...")
193
+ global opened_state
194
+ with imgui.begin_tab_item("Item 3", opened=opened_state) as item3:
195
+ opened_state = item3.opened
196
+ if item3.selected:
197
+ imgui.text("Hello Saylor!")
198
+
199
+ if active["list box"]:
200
+ with imgui.begin("Example: custom listbox"):
201
+ with imgui.begin_list_box("List", 200, 100) as list_box:
202
+ if list_box.opened:
203
+ imgui.selectable("Selected", True)
204
+ imgui.selectable("Not Selected", False)
205
+
206
+ if active["table"]:
207
+ with imgui.begin("Example: table"):
208
+ with imgui.begin_table("data", 2) as table:
209
+ if table.opened:
210
+ imgui.table_next_column()
211
+ imgui.table_header("A")
212
+ imgui.table_next_column()
213
+ imgui.table_header("B")
214
+
215
+ imgui.table_next_row()
216
+ imgui.table_next_column()
217
+ imgui.text("123")
218
+
219
+ imgui.table_next_column()
220
+ imgui.text("456")
221
+
222
+ imgui.table_next_row()
223
+ imgui.table_next_column()
224
+ imgui.text("789")
225
+
226
+ imgui.table_next_column()
227
+ imgui.text("111")
228
+
229
+ imgui.table_next_row()
230
+ imgui.table_next_column()
231
+ imgui.text("222")
232
+
233
+ imgui.table_next_column()
234
+ imgui.text("333")
235
+
236
+
237
+ def render_frame(impl, window, font):
238
+ glfw.poll_events()
239
+ impl.process_inputs()
240
+ imgui.new_frame()
241
+
242
+ gl.glClearColor(0.1, 0.1, 0.1, 1)
243
+ gl.glClear(gl.GL_COLOR_BUFFER_BIT)
244
+
245
+ if font is not None:
246
+ imgui.push_font(font)
247
+ frame_commands()
248
+ if font is not None:
249
+ imgui.pop_font()
250
+
251
+ imgui.render()
252
+ impl.render(imgui.get_draw_data())
253
+ glfw.swap_buffers(window)
254
+
255
+
256
+ def impl_glfw_init():
257
+ width, height = 1600, 900
258
+ window_name = "minimal ImGui/GLFW3 example"
259
+
260
+ if not glfw.init():
261
+ print("Could not initialize OpenGL context")
262
+ sys.exit(1)
263
+
264
+ glfw.window_hint(glfw.CONTEXT_VERSION_MAJOR, 3)
265
+ glfw.window_hint(glfw.CONTEXT_VERSION_MINOR, 3)
266
+ glfw.window_hint(glfw.OPENGL_PROFILE, glfw.OPENGL_CORE_PROFILE)
267
+ glfw.window_hint(glfw.OPENGL_FORWARD_COMPAT, gl.GL_TRUE)
268
+
269
+ window = glfw.create_window(int(width), int(height), window_name, None, None)
270
+ glfw.make_context_current(window)
271
+
272
+ if not window:
273
+ glfw.terminate()
274
+ print("Could not initialize Window")
275
+ sys.exit(1)
276
+
277
+ return window
278
+
279
+
280
+ def main():
281
+ imgui.create_context()
282
+ window = impl_glfw_init()
283
+
284
+ impl = GlfwRenderer(window)
285
+
286
+ io = imgui.get_io()
287
+ jb = io.fonts.add_font_from_file_ttf(path_to_font, 30) if path_to_font is not None else None
288
+ impl.refresh_font_texture()
289
+
290
+ while not glfw.window_should_close(window):
291
+ render_frame(impl, window, jb)
292
+
293
+ impl.shutdown()
294
+ glfw.terminate()
295
+
296
+
297
+ if __name__ == "__main__":
298
+ main()
Imgui/demo.py ADDED
@@ -0,0 +1,301 @@
1
+ # pip install glfw
2
+ # pip install PyOpenGL
3
+ # pip install imgui
4
+
5
+
6
+ # -*- coding: utf-8 -*-
7
+ import os
8
+ import sys
9
+
10
+ # For Linux/Wayland users.
11
+ if os.getenv("XDG_SESSION_TYPE") == "wayland":
12
+ os.environ["XDG_SESSION_TYPE"] = "x11"
13
+
14
+ import glfw
15
+ import OpenGL.GL as gl
16
+ import imgui
17
+ from imgui.integrations.glfw import GlfwRenderer
18
+
19
+ active = {
20
+ "window": True,
21
+ "child": False,
22
+ "tooltip": False,
23
+ "menu bar": False,
24
+ "popup": False,
25
+ "popup modal": False,
26
+ "popup context item": False,
27
+ "popup context window": False,
28
+ "drag drop": False,
29
+ "group": False,
30
+ "tab bar": False,
31
+ "list box": False,
32
+ "popup context void": False,
33
+ "table": False,
34
+ }
35
+
36
+ path_to_font = None # "path/to/font.ttf"
37
+
38
+ opened_state = True
39
+
40
+
41
+ def frame_commands():
42
+ gl.glClearColor(0.1, 0.1, 0.1, 1)
43
+ gl.glClear(gl.GL_COLOR_BUFFER_BIT)
44
+
45
+ io = imgui.get_io()
46
+
47
+ if io.key_ctrl and io.keys_down[glfw.KEY_Q]:
48
+ sys.exit(0)
49
+
50
+ if imgui.begin_main_menu_bar():
51
+ if imgui.begin_menu("File", True):
52
+ clicked_quit, selected_quit = imgui.menu_item("Quit", "Ctrl+Q", False, True)
53
+
54
+ if clicked_quit:
55
+ sys.exit(0)
56
+
57
+ imgui.end_menu()
58
+ imgui.end_main_menu_bar()
59
+
60
+ # turn windows on/off
61
+ imgui.begin("Active examples")
62
+ for label, enabled in active.copy().items():
63
+ _, enabled = imgui.checkbox(label, enabled)
64
+ active[label] = enabled
65
+ imgui.end()
66
+
67
+ if active["window"]:
68
+ imgui.begin("Hello, Imgui!")
69
+ imgui.text("Hello, World!")
70
+ imgui.end()
71
+
72
+ if active["child"]:
73
+ imgui.begin("Example: child region")
74
+ imgui.begin_child("region", 150, -50, border=True)
75
+ imgui.text("inside region")
76
+ imgui.end_child()
77
+ imgui.text("outside region")
78
+ imgui.end()
79
+
80
+ if active["tooltip"]:
81
+ imgui.begin("Example: tooltip")
82
+ imgui.button("Click me!")
83
+ if imgui.is_item_hovered():
84
+ imgui.begin_tooltip()
85
+ imgui.text("This button is clickable.")
86
+ imgui.end_tooltip()
87
+ imgui.end()
88
+
89
+ if active["menu bar"]:
90
+ try:
91
+ flags = imgui.WINDOW_MENU_BAR
92
+ imgui.begin("Child Window - File Browser", flags=flags)
93
+ if imgui.begin_menu_bar():
94
+ if imgui.begin_menu('File'):
95
+ clicked, state = imgui.menu_item('Close')
96
+ if clicked:
97
+ active["menu bar"] = False
98
+ raise Exception
99
+ imgui.end_menu()
100
+ imgui.end_menu_bar()
101
+ imgui.end()
102
+ except Exception:
103
+ print("exception caught, but too late!")
104
+
105
+ if active["popup"]:
106
+ imgui.begin("Example: simple popup")
107
+ if imgui.button("select"):
108
+ imgui.open_popup("select-popup")
109
+ imgui.same_line()
110
+ if imgui.begin_popup("select-popup"):
111
+ imgui.text("Select one")
112
+ imgui.separator()
113
+ imgui.selectable("One")
114
+ imgui.selectable("Two")
115
+ imgui.selectable("Three")
116
+ imgui.end_popup()
117
+ imgui.end()
118
+
119
+ if active["popup modal"]:
120
+ imgui.begin("Example: simple popup modal")
121
+ if imgui.button("Open Modal popup"):
122
+ imgui.open_popup("select-popup-modal")
123
+ imgui.same_line()
124
+ if imgui.begin_popup_modal("select-popup-modal")[0]:
125
+ imgui.text("Select an option:")
126
+ imgui.separator()
127
+ imgui.selectable("One")
128
+ imgui.selectable("Two")
129
+ imgui.selectable("Three")
130
+ imgui.end_popup()
131
+ imgui.end()
132
+
133
+ if active["popup context item"]:
134
+ imgui.begin("Example: popup context view")
135
+ imgui.text("Right-click to set value.")
136
+ if imgui.begin_popup_context_item("Item Context Menu"):
137
+ imgui.selectable("Set to Zero")
138
+ imgui.end_popup()
139
+ imgui.end()
140
+
141
+ if active["popup context window"]:
142
+ imgui.begin("Example: popup context window")
143
+ if imgui.begin_popup_context_window():
144
+ imgui.selectable("Clear")
145
+ imgui.end_popup()
146
+ imgui.end()
147
+
148
+ if active["popup context void"]:
149
+ if imgui.begin_popup_context_void():
150
+ imgui.selectable("Clear")
151
+ imgui.end_popup()
152
+
153
+ if active["drag drop"]:
154
+ imgui.begin("Example: drag and drop")
155
+ imgui.button('source')
156
+ if imgui.begin_drag_drop_source():
157
+ imgui.set_drag_drop_payload('itemtype', b'payload')
158
+ imgui.button('dragged source')
159
+ imgui.end_drag_drop_source()
160
+ imgui.button('dest')
161
+ if imgui.begin_drag_drop_target():
162
+ payload = imgui.accept_drag_drop_payload('itemtype')
163
+ if payload is not None:
164
+ print('Received:', payload)
165
+ imgui.end_drag_drop_target()
166
+ imgui.end()
167
+
168
+ if active["group"]:
169
+ imgui.begin("Example: item groups")
170
+ imgui.begin_group()
171
+ imgui.text("First group (buttons):")
172
+ imgui.button("Button A")
173
+ imgui.button("Button B")
174
+ imgui.end_group()
175
+ imgui.same_line(spacing=50)
176
+ imgui.begin_group()
177
+ imgui.text("Second group (text and bullet texts):")
178
+ imgui.bullet_text("Bullet A")
179
+ imgui.bullet_text("Bullet B")
180
+ imgui.end_group()
181
+ imgui.end()
182
+
183
+ if active["tab bar"]:
184
+ imgui.begin("Example Tab Bar")
185
+ if imgui.begin_tab_bar("MyTabBar"):
186
+ if imgui.begin_tab_item("Item 1")[0]:
187
+ imgui.text("Here is the tab content!")
188
+ imgui.end_tab_item()
189
+ if imgui.begin_tab_item("Item 2")[0]:
190
+ imgui.text("Another content...")
191
+ imgui.end_tab_item()
192
+ global opened_state
193
+ selected, opened_state = imgui.begin_tab_item("Item 3", opened=opened_state)
194
+ if selected:
195
+ imgui.text("Hello Saylor!")
196
+ imgui.end_tab_item()
197
+ imgui.end_tab_bar()
198
+ imgui.end()
199
+
200
+ if active["list box"]:
201
+ imgui.begin("Example: custom listbox")
202
+ if imgui.begin_list_box("List", 200, 100):
203
+ imgui.selectable("Selected", True)
204
+ imgui.selectable("Not Selected", False)
205
+ imgui.end_list_box()
206
+ imgui.end()
207
+
208
+ if active["table"]:
209
+ imgui.begin("Example: table")
210
+ if imgui.begin_table("data", 2):
211
+ imgui.table_next_column()
212
+ imgui.table_header("A")
213
+ imgui.table_next_column()
214
+ imgui.table_header("B")
215
+
216
+ imgui.table_next_row()
217
+ imgui.table_next_column()
218
+ imgui.text("123")
219
+
220
+ imgui.table_next_column()
221
+ imgui.text("456")
222
+
223
+ imgui.table_next_row()
224
+ imgui.table_next_column()
225
+ imgui.text("789")
226
+
227
+ imgui.table_next_column()
228
+ imgui.text("111")
229
+
230
+ imgui.table_next_row()
231
+ imgui.table_next_column()
232
+ imgui.text("222")
233
+
234
+ imgui.table_next_column()
235
+ imgui.text("333")
236
+ imgui.end_table()
237
+ imgui.end()
238
+
239
+
240
+ def render_frame(impl, window, font):
241
+ glfw.poll_events()
242
+ impl.process_inputs()
243
+ imgui.new_frame()
244
+
245
+ gl.glClearColor(0.1, 0.1, 0.1, 1)
246
+ gl.glClear(gl.GL_COLOR_BUFFER_BIT)
247
+
248
+ if font is not None:
249
+ imgui.push_font(font)
250
+ frame_commands()
251
+ if font is not None:
252
+ imgui.pop_font()
253
+
254
+ imgui.render()
255
+ impl.render(imgui.get_draw_data())
256
+ glfw.swap_buffers(window)
257
+
258
+
259
+ def impl_glfw_init():
260
+ width, height = 1600, 900
261
+ window_name = "minimal ImGui/GLFW3 example"
262
+
263
+ if not glfw.init():
264
+ print("Could not initialize OpenGL context")
265
+ sys.exit(1)
266
+
267
+ glfw.window_hint(glfw.CONTEXT_VERSION_MAJOR, 3)
268
+ glfw.window_hint(glfw.CONTEXT_VERSION_MINOR, 3)
269
+ glfw.window_hint(glfw.OPENGL_PROFILE, glfw.OPENGL_CORE_PROFILE)
270
+ glfw.window_hint(glfw.OPENGL_FORWARD_COMPAT, gl.GL_TRUE)
271
+
272
+ window = glfw.create_window(int(width), int(height), window_name, None, None)
273
+ glfw.make_context_current(window)
274
+
275
+ if not window:
276
+ glfw.terminate()
277
+ print("Could not initialize Window")
278
+ sys.exit(1)
279
+
280
+ return window
281
+
282
+
283
+ def main():
284
+ imgui.create_context()
285
+ window = impl_glfw_init()
286
+
287
+ impl = GlfwRenderer(window)
288
+
289
+ io = imgui.get_io()
290
+ jb = io.fonts.add_font_from_file_ttf(path_to_font, 30) if path_to_font is not None else None
291
+ impl.refresh_font_texture()
292
+
293
+ while not glfw.window_should_close(window):
294
+ render_frame(impl, window, jb)
295
+
296
+ impl.shutdown()
297
+ glfw.terminate()
298
+
299
+
300
+ if __name__ == "__main__":
301
+ main()
Imgui/imgui.ini ADDED
@@ -0,0 +1,25 @@
1
+ [Window][Debug##Default]
2
+ Pos=60,60
3
+ Size=400,400
4
+ Collapsed=0
5
+
6
+ [Window][Active examples]
7
+ Pos=21,83
8
+ Size=179,353
9
+ Collapsed=0
10
+
11
+ [Window][Hello, Imgui!]
12
+ Pos=60,60
13
+ Size=107,48
14
+ Collapsed=0
15
+
16
+ [Window][Example: table]
17
+ Pos=60,60
18
+ Size=66,103
19
+ Collapsed=0
20
+
21
+ [Window][Example: drag and drop]
22
+ Pos=60,60
23
+ Size=66,77
24
+ Collapsed=0
25
+
LDM/notebooks/_1_Main.ipynb ADDED
@@ -0,0 +1,1481 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Imports"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 7,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/plain": [
18
+ "device(type='cuda')"
19
+ ]
20
+ },
21
+ "execution_count": 7,
22
+ "metadata": {},
23
+ "output_type": "execute_result"
24
+ }
25
+ ],
26
+ "source": [
27
+ "import os\n",
28
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
29
+ "\n",
30
+ "import torch\n",
31
+ "import torch.nn as nn\n",
32
+ "import numpy as np\n",
33
+ "from collections import namedtuple\n",
34
+ "\n",
35
+ "import pandas as pd\n",
36
+ "import torchvision as tv\n",
37
+ "from torchvision.transforms import v2\n",
38
+ "from tqdm.auto import tqdm, trange\n",
39
+ "\n",
40
+ "import yaml\n",
41
+ "from dotdict import DotDict\n",
42
+ "import random\n",
43
+ "import torch.hub\n",
44
+ "from torch.utils.data import Dataset, DataLoader\n",
45
+ "from torchvision.utils import make_grid\n",
46
+ "\n",
47
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
48
+ "device"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "markdown",
53
+ "metadata": {},
54
+ "source": [
55
+ "### *LPIPS*: Learned Perceptual Image Patch Similarity"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 8,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "class vgg16(nn.Module):\n",
65
+ " def __init__(self):\n",
66
+ " super(vgg16, self).__init__()\n",
67
+ " vgg_pretrained_features = tv.models.vgg16(\n",
68
+ " weights=tv.models.VGG16_Weights.IMAGENET1K_V1\n",
69
+ " ).features\n",
70
+ " self.slice1 = torch.nn.Sequential()\n",
71
+ " self.slice2 = torch.nn.Sequential()\n",
72
+ " self.slice3 = torch.nn.Sequential()\n",
73
+ " self.slice4 = torch.nn.Sequential()\n",
74
+ " self.slice5 = torch.nn.Sequential()\n",
75
+ " self.N_slices = 5\n",
76
+ " for x in range(4):\n",
77
+ " self.slice1.add_module(str(x), vgg_pretrained_features[x])\n",
78
+ " for x in range(4, 9):\n",
79
+ " self.slice2.add_module(str(x), vgg_pretrained_features[x])\n",
80
+ " for x in range(9, 16):\n",
81
+ " self.slice3.add_module(str(x), vgg_pretrained_features[x])\n",
82
+ " for x in range(16, 23):\n",
83
+ " self.slice4.add_module(str(x), vgg_pretrained_features[x])\n",
84
+ " for x in range(23, 30):\n",
85
+ " self.slice5.add_module(str(x), vgg_pretrained_features[x])\n",
86
+ " \n",
87
+ " self.eval()\n",
88
+ " for param in self.parameters():\n",
89
+ " param.requires_grad = False\n",
90
+ "\n",
91
+ " def forward(self, X):\n",
92
+ " h1 = self.slice1(X)\n",
93
+ " h2 = self.slice2(h1)\n",
94
+ " h3 = self.slice3(h2)\n",
95
+ " h4 = self.slice4(h3)\n",
96
+ " h5 = self.slice5(h4)\n",
97
+ " vgg_outputs = namedtuple(\"VggOutputs\", ['h1', 'h2', 'h3', 'h4', 'h5'])\n",
98
+ " out = vgg_outputs(h1, h2, h3, h4, h5)\n",
99
+ " return out\n",
100
+ "\n",
101
+ "\n",
102
+ "def _spatial_average(in_tens, keepdim=True):\n",
103
+ " return in_tens.mean([2, 3], keepdim=keepdim)\n",
104
+ "\n",
105
+ "\n",
106
+ "def _normalize_tensor(in_feat, eps= 1e-8):\n",
107
+ " norm_factor = torch.sqrt(eps + torch.sum(in_feat**2, dim=1, keepdim=True))\n",
108
+ " return in_feat / norm_factor\n",
109
+ "\n",
110
+ "\n",
111
+ "class ScalingLayer(nn.Module):\n",
112
+ " def __init__(self):\n",
113
+ " super(ScalingLayer, self).__init__()\n",
114
+ " # Imagnet normalization for (0-1)\n",
115
+ " # mean = [0.485, 0.456, 0.406]\n",
116
+ " # std = [0.229, 0.224, 0.225]\n",
117
+ "\n",
118
+ " self.register_buffer('shift', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])\n",
119
+ " self.register_buffer('scale', torch.Tensor([.458, .448, .450])[None, :, None, None])\n",
120
+ "\n",
121
+ " def forward(self, inp):\n",
122
+ " return (inp - self.shift) / self.scale\n",
123
+ "\n",
124
+ "\n",
125
+ "class NetLinLayer(nn.Module):\n",
126
+ " ''' A single linear layer which does a 1x1 conv '''\n",
127
+ " def __init__(self, chn_in, chn_out=1, use_dropout=False):\n",
128
+ " super(NetLinLayer, self).__init__()\n",
129
+ " layers = [nn.Dropout(), ] if (use_dropout) else []\n",
130
+ " layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ]\n",
131
+ " self.model = nn.Sequential(*layers)\n",
132
+ "\n",
133
+ " def forward(self, x):\n",
134
+ " return self.model(x)\n",
135
+ "\n",
136
+ "\n",
137
+ "class LPIPS(nn.Module):\n",
138
+ " def __init__(self, net='vgg', version='0.1', use_dropout=True):\n",
139
+ " super(LPIPS, self).__init__()\n",
140
+ " self.version = version\n",
141
+ " self.scaling_layer = ScalingLayer()\n",
142
+ " self.chns = [64, 128, 256, 512, 512]\n",
143
+ " self.L = len(self.chns)\n",
144
+ " self.net = vgg16()\n",
145
+ " self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)\n",
146
+ " self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)\n",
147
+ " self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)\n",
148
+ " self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)\n",
149
+ " self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)\n",
150
+ " self.lins = nn.ModuleList([self.lin0, self.lin1, self.lin2, self.lin3, self.lin4])\n",
151
+ "\n",
152
+ " # --- Orignal url --------------------\n",
153
+ " # weights_url = f\"https://github.com/richzhang/PerceptualSimilarity/raw/master/lpips/weights/v{version}/{net}.pth\"\n",
154
+ " \n",
155
+ " # --- Orignal Forked url -------------\n",
156
+ " weights_url = f\"https://github.com/akuresonite/PerceptualSimilarity-Forked/raw/master/lpips/weights/v{version}/{net}.pth\"\n",
157
+ " \n",
158
+ " # --- Orignal torchmetric url --------\n",
159
+ " # weights_url = \"https://github.com/Lightning-AI/torchmetrics/raw/master/src/torchmetrics/functional/image/lpips_models/vgg.pth\"\n",
160
+ " \n",
161
+ " state_dict = torch.hub.load_state_dict_from_url(weights_url, map_location='cpu')\n",
162
+ " self.load_state_dict(state_dict, strict=False)\n",
163
+ " \n",
164
+ " self.eval()\n",
165
+ " for param in self.parameters():\n",
166
+ " param.requires_grad = False\n",
167
+ "\n",
168
+ " def forward(self, in0, in1, normalize=False):\n",
169
+ " # Scale the inputs to -1 to +1 range if input in [0,1]\n",
170
+ " if normalize:\n",
171
+ " in0 = 2 * in0 - 1\n",
172
+ " in1 = 2 * in1 - 1\n",
173
+ "\n",
174
+ " in0_input, in1_input = self.scaling_layer(in0), self.scaling_layer(in1)\n",
175
+ " # in0_input, in1_input = in0, in1\n",
176
+ " \n",
177
+ " outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)\n",
178
+ " \n",
179
+ " diffs = {}\n",
180
+ " for kk in range(self.L):\n",
181
+ " feats0 = _normalize_tensor(outs0[kk])\n",
182
+ " feats1 = _normalize_tensor(outs1[kk])\n",
183
+ " diffs[kk] = (feats0 - feats1) ** 2\n",
184
+ " \n",
185
+ " res = [_spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)]\n",
186
+ " val = sum(res)\n",
187
+ " return val.reshape(-1)"
188
+ ]
189
+ },
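A brief usage sketch for the LPIPS module defined above (my addition, not part of the notebook): it is used as a frozen perceptual distance between reconstructions and targets. Inputs are expected in [-1, 1]; pass `normalize=True` for tensors in [0, 1].

```python
# Sketch only: assumes `device` and a batch `images` in [-1, 1], plus a
# reconstruction `recon` of the same shape (here just a copy as a placeholder).
lpips_loss = LPIPS().to(device).eval()

with torch.no_grad():
    recon = images.clone()
    d = lpips_loss(recon, images)                                        # already in [-1, 1]
    d01 = lpips_loss((recon + 1) / 2, (images + 1) / 2, normalize=True)  # [0, 1] inputs

print(d.shape)  # one distance per batch element, e.g. torch.Size([32])
```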
190
+ {
191
+ "cell_type": "markdown",
192
+ "metadata": {},
193
+ "source": [
194
+ "### Discriminator"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 9,
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "class Discriminator(nn.Module):\n",
204
+ " r\"\"\"\n",
205
+ " PatchGAN Discriminator.\n",
206
+ " Rather than taking IMG_CHANNELSxIMG_HxIMG_W all the way to\n",
207
+ " 1 scalar value , we instead predict grid of values.\n",
208
+ " Where each grid is prediction of how likely\n",
209
+ " the discriminator thinks that the image patch corresponding\n",
210
+ " to the grid cell is real\n",
211
+ " \"\"\"\n",
212
+ "\n",
213
+ " def __init__(\n",
214
+ " self,\n",
215
+ " im_channels=3,\n",
216
+ " conv_channels=[64, 128, 256],\n",
217
+ " kernels=[4, 4, 4, 4],\n",
218
+ " strides=[2, 2, 2, 1],\n",
219
+ " paddings=[1, 1, 1, 1],\n",
220
+ " ):\n",
221
+ " super().__init__()\n",
222
+ " self.im_channels = im_channels\n",
223
+ " activation = nn.LeakyReLU(0.2)\n",
224
+ " layers_dim = [self.im_channels] + conv_channels + [1]\n",
225
+ " self.layers = nn.ModuleList(\n",
226
+ " [\n",
227
+ " nn.Sequential(\n",
228
+ " nn.Conv2d(\n",
229
+ " layers_dim[i],\n",
230
+ " layers_dim[i + 1],\n",
231
+ " kernel_size=kernels[i],\n",
232
+ " stride=strides[i],\n",
233
+ " padding=paddings[i],\n",
234
+ " bias=False if i != 0 else True,\n",
235
+ " ),\n",
236
+ " (\n",
237
+ " nn.BatchNorm2d(layers_dim[i + 1])\n",
238
+ " if i != len(layers_dim) - 2 and i != 0\n",
239
+ " else nn.Identity()\n",
240
+ " ),\n",
241
+ " activation if i != len(layers_dim) - 2 else nn.Identity(),\n",
242
+ " )\n",
243
+ " for i in range(len(layers_dim) - 1)\n",
244
+ " ]\n",
245
+ " )\n",
246
+ "\n",
247
+ " def forward(self, x):\n",
248
+ " out = x\n",
249
+ " for layer in self.layers:\n",
250
+ " out = layer(out)\n",
251
+ " return out"
252
+ ]
253
+ },
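To make the "grid of values" in the docstring concrete, a small hedged check of the patch-grid output shape (my addition): with kernel 4, stride 2, padding 1, each of the first three convolutions halves the spatial size and the final stride-1 convolution reduces it by one, so a 256x256 input yields a 31x31 grid of per-patch logits.

```python
# Sketch only: assumes `device` from the imports cell above.
disc = Discriminator(im_channels=3).to(device)
dummy = torch.randn(4, 3, 256, 256, device=device)
print(disc(dummy).shape)  # expected: torch.Size([4, 1, 31, 31])
```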
254
+ {
255
+ "cell_type": "markdown",
256
+ "metadata": {},
257
+ "source": [
258
+ "### *VQVAE*"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": 10,
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "class DownBlock(nn.Module):\n",
268
+ " r\"\"\"\n",
269
+ " Down conv block with attention.\n",
270
+ " Sequence of following block\n",
271
+ " 1. Resnet block with time embedding\n",
272
+ " 2. Attention block\n",
273
+ " 3. Downsample\n",
274
+ " \"\"\"\n",
275
+ "\n",
276
+ " def __init__(\n",
277
+ " self,\n",
278
+ " in_channels,\n",
279
+ " out_channels,\n",
280
+ " t_emb_dim,\n",
281
+ " down_sample,\n",
282
+ " num_heads,\n",
283
+ " num_layers,\n",
284
+ " attn,\n",
285
+ " norm_channels,\n",
286
+ " cross_attn=False,\n",
287
+ " context_dim=None,\n",
288
+ " ):\n",
289
+ " super().__init__()\n",
290
+ " self.num_layers = num_layers\n",
291
+ " self.down_sample = down_sample\n",
292
+ " self.attn = attn\n",
293
+ " self.context_dim = context_dim\n",
294
+ " self.cross_attn = cross_attn\n",
295
+ " self.t_emb_dim = t_emb_dim\n",
296
+ " self.resnet_conv_first = nn.ModuleList(\n",
297
+ " [\n",
298
+ " nn.Sequential(\n",
299
+ " nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),\n",
300
+ " nn.SiLU(),\n",
301
+ " nn.Conv2d(\n",
302
+ " in_channels if i == 0 else out_channels,\n",
303
+ " out_channels,\n",
304
+ " kernel_size=3,\n",
305
+ " stride=1,\n",
306
+ " padding=1,\n",
307
+ " ),\n",
308
+ " )\n",
309
+ " for i in range(num_layers)\n",
310
+ " ]\n",
311
+ " )\n",
312
+ " if self.t_emb_dim is not None:\n",
313
+ " self.t_emb_layers = nn.ModuleList(\n",
314
+ " [\n",
315
+ " nn.Sequential(nn.SiLU(), nn.Linear(self.t_emb_dim, out_channels))\n",
316
+ " for _ in range(num_layers)\n",
317
+ " ]\n",
318
+ " )\n",
319
+ " self.resnet_conv_second = nn.ModuleList(\n",
320
+ " [\n",
321
+ " nn.Sequential(\n",
322
+ " nn.GroupNorm(norm_channels, out_channels),\n",
323
+ " nn.SiLU(),\n",
324
+ " nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),\n",
325
+ " )\n",
326
+ " for _ in range(num_layers)\n",
327
+ " ]\n",
328
+ " )\n",
329
+ "\n",
330
+ " if self.attn:\n",
331
+ " self.attention_norms = nn.ModuleList(\n",
332
+ " [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]\n",
333
+ " )\n",
334
+ "\n",
335
+ " self.attentions = nn.ModuleList(\n",
336
+ " [\n",
337
+ " nn.MultiheadAttention(out_channels, num_heads, batch_first=True)\n",
338
+ " for _ in range(num_layers)\n",
339
+ " ]\n",
340
+ " )\n",
341
+ " if self.cross_attn:\n",
342
+ " assert context_dim is not None, \"Context Dimension must be passed for cross attention\"\n",
343
+ " self.cross_attention_norms = nn.ModuleList(\n",
344
+ " [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]\n",
345
+ " )\n",
346
+ " self.cross_attentions = nn.ModuleList(\n",
347
+ " [\n",
348
+ " nn.MultiheadAttention(out_channels, num_heads, batch_first=True)\n",
349
+ " for _ in range(num_layers)\n",
350
+ " ]\n",
351
+ " )\n",
352
+ " self.context_proj = nn.ModuleList(\n",
353
+ " [nn.Linear(context_dim, out_channels) for _ in range(num_layers)]\n",
354
+ " )\n",
355
+ " self.residual_input_conv = nn.ModuleList(\n",
356
+ " [\n",
357
+ " nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)\n",
358
+ " for i in range(num_layers)\n",
359
+ " ]\n",
360
+ " )\n",
361
+ " self.down_sample_conv = (\n",
362
+ " nn.Conv2d(out_channels, out_channels, 4, 2, 1) if self.down_sample else nn.Identity()\n",
363
+ " )\n",
364
+ "\n",
365
+ " def forward(self, x, t_emb=None, context=None):\n",
366
+ " out = x\n",
367
+ " for i in range(self.num_layers):\n",
368
+ " # Resnet block of Unet\n",
369
+ "\n",
370
+ " resnet_input = out\n",
371
+ " out = self.resnet_conv_first[i](out)\n",
372
+ " if self.t_emb_dim is not None:\n",
373
+ " out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]\n",
374
+ " out = self.resnet_conv_second[i](out)\n",
375
+ " out = out + self.residual_input_conv[i](resnet_input)\n",
376
+ "\n",
377
+ " if self.attn:\n",
378
+ " # Attention block of Unet\n",
379
+ "\n",
380
+ " batch_size, channels, h, w = out.shape\n",
381
+ " in_attn = out.reshape(batch_size, channels, h * w)\n",
382
+ " in_attn = self.attention_norms[i](in_attn)\n",
383
+ " in_attn = in_attn.transpose(1, 2)\n",
384
+ " out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)\n",
385
+ " out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)\n",
386
+ " out = out + out_attn\n",
387
+ " if self.cross_attn:\n",
388
+ " assert (\n",
389
+ " context is not None\n",
390
+ " ), \"context cannot be None if cross attention layers are used\"\n",
391
+ " batch_size, channels, h, w = out.shape\n",
392
+ " in_attn = out.reshape(batch_size, channels, h * w)\n",
393
+ " in_attn = self.cross_attention_norms[i](in_attn)\n",
394
+ " in_attn = in_attn.transpose(1, 2)\n",
395
+ " assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim\n",
396
+ " context_proj = self.context_proj[i](context)\n",
397
+ " out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)\n",
398
+ " out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)\n",
399
+ " out = out + out_attn\n",
400
+ " # Downsample\n",
401
+ "\n",
402
+ " out = self.down_sample_conv(out)\n",
403
+ " return out\n",
404
+ "\n",
405
+ "\n",
406
+ "class MidBlock(nn.Module):\n",
407
+ " r\"\"\"\n",
408
+ " Mid conv block with attention.\n",
409
+ " Sequence of following blocks\n",
410
+ " 1. Resnet block with time embedding\n",
411
+ " 2. Attention block\n",
412
+ " 3. Resnet block with time embedding\n",
413
+ " \"\"\"\n",
414
+ "\n",
415
+ " def __init__(\n",
416
+ " self,\n",
417
+ " in_channels,\n",
418
+ " out_channels,\n",
419
+ " t_emb_dim,\n",
420
+ " num_heads,\n",
421
+ " num_layers,\n",
422
+ " norm_channels,\n",
423
+ " cross_attn=None,\n",
424
+ " context_dim=None,\n",
425
+ " ):\n",
426
+ " super().__init__()\n",
427
+ " self.num_layers = num_layers\n",
428
+ " self.t_emb_dim = t_emb_dim\n",
429
+ " self.context_dim = context_dim\n",
430
+ " self.cross_attn = cross_attn\n",
431
+ " self.resnet_conv_first = nn.ModuleList(\n",
432
+ " [\n",
433
+ " nn.Sequential(\n",
434
+ " nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),\n",
435
+ " nn.SiLU(),\n",
436
+ " nn.Conv2d(\n",
437
+ " in_channels if i == 0 else out_channels,\n",
438
+ " out_channels,\n",
439
+ " kernel_size=3,\n",
440
+ " stride=1,\n",
441
+ " padding=1,\n",
442
+ " ),\n",
443
+ " )\n",
444
+ " for i in range(num_layers + 1)\n",
445
+ " ]\n",
446
+ " )\n",
447
+ "\n",
448
+ " if self.t_emb_dim is not None:\n",
449
+ " self.t_emb_layers = nn.ModuleList(\n",
450
+ " [\n",
451
+ " nn.Sequential(nn.SiLU(), nn.Linear(t_emb_dim, out_channels))\n",
452
+ " for _ in range(num_layers + 1)\n",
453
+ " ]\n",
454
+ " )\n",
455
+ " self.resnet_conv_second = nn.ModuleList(\n",
456
+ " [\n",
457
+ " nn.Sequential(\n",
458
+ " nn.GroupNorm(norm_channels, out_channels),\n",
459
+ " nn.SiLU(),\n",
460
+ " nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),\n",
461
+ " )\n",
462
+ " for _ in range(num_layers + 1)\n",
463
+ " ]\n",
464
+ " )\n",
465
+ "\n",
466
+ " self.attention_norms = nn.ModuleList(\n",
467
+ " [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]\n",
468
+ " )\n",
469
+ "\n",
470
+ " self.attentions = nn.ModuleList(\n",
471
+ " [\n",
472
+ " nn.MultiheadAttention(out_channels, num_heads, batch_first=True)\n",
473
+ " for _ in range(num_layers)\n",
474
+ " ]\n",
475
+ " )\n",
476
+ " if self.cross_attn:\n",
477
+ " assert context_dim is not None, \"Context Dimension must be passed for cross attention\"\n",
478
+ " self.cross_attention_norms = nn.ModuleList(\n",
479
+ " [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]\n",
480
+ " )\n",
481
+ " self.cross_attentions = nn.ModuleList(\n",
482
+ " [\n",
483
+ " nn.MultiheadAttention(out_channels, num_heads, batch_first=True)\n",
484
+ " for _ in range(num_layers)\n",
485
+ " ]\n",
486
+ " )\n",
487
+ " self.context_proj = nn.ModuleList(\n",
488
+ " [nn.Linear(context_dim, out_channels) for _ in range(num_layers)]\n",
489
+ " )\n",
490
+ " self.residual_input_conv = nn.ModuleList(\n",
491
+ " [\n",
492
+ " nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)\n",
493
+ " for i in range(num_layers + 1)\n",
494
+ " ]\n",
495
+ " )\n",
496
+ "\n",
497
+ " def forward(self, x, t_emb=None, context=None):\n",
498
+ " out = x\n",
499
+ "\n",
500
+ " # First resnet block\n",
501
+ "\n",
502
+ " resnet_input = out\n",
503
+ " out = self.resnet_conv_first[0](out)\n",
504
+ " if self.t_emb_dim is not None:\n",
505
+ " out = out + self.t_emb_layers[0](t_emb)[:, :, None, None]\n",
506
+ " out = self.resnet_conv_second[0](out)\n",
507
+ " out = out + self.residual_input_conv[0](resnet_input)\n",
508
+ "\n",
509
+ " for i in range(self.num_layers):\n",
510
+ " # Attention Block\n",
511
+ "\n",
512
+ " batch_size, channels, h, w = out.shape\n",
513
+ " in_attn = out.reshape(batch_size, channels, h * w)\n",
514
+ " in_attn = self.attention_norms[i](in_attn)\n",
515
+ " in_attn = in_attn.transpose(1, 2)\n",
516
+ " out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)\n",
517
+ " out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)\n",
518
+ " out = out + out_attn\n",
519
+ "\n",
520
+ " if self.cross_attn:\n",
521
+ " assert (\n",
522
+ " context is not None\n",
523
+ " ), \"context cannot be None if cross attention layers are used\"\n",
524
+ " batch_size, channels, h, w = out.shape\n",
525
+ " in_attn = out.reshape(batch_size, channels, h * w)\n",
526
+ " in_attn = self.cross_attention_norms[i](in_attn)\n",
527
+ " in_attn = in_attn.transpose(1, 2)\n",
528
+ " assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim\n",
529
+ " context_proj = self.context_proj[i](context)\n",
530
+ " out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)\n",
531
+ " out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)\n",
532
+ " out = out + out_attn\n",
533
+ " # Resnet Block\n",
534
+ "\n",
535
+ " resnet_input = out\n",
536
+ " out = self.resnet_conv_first[i + 1](out)\n",
537
+ " if self.t_emb_dim is not None:\n",
538
+ " out = out + self.t_emb_layers[i + 1](t_emb)[:, :, None, None]\n",
539
+ " out = self.resnet_conv_second[i + 1](out)\n",
540
+ " out = out + self.residual_input_conv[i + 1](resnet_input)\n",
541
+ " return out\n",
542
+ "\n",
543
+ "\n",
544
+ "class UpBlock(nn.Module):\n",
545
+ " r\"\"\"\n",
546
+ " Up conv block with attention.\n",
547
+ " Sequence of following blocks\n",
548
+ " 1. Upsample\n",
549
+ " 1. Concatenate Down block output\n",
550
+ " 2. Resnet block with time embedding\n",
551
+ " 3. Attention Block\n",
552
+ " \"\"\"\n",
553
+ "\n",
554
+ " def __init__(\n",
555
+ " self,\n",
556
+ " in_channels,\n",
557
+ " out_channels,\n",
558
+ " t_emb_dim,\n",
559
+ " up_sample,\n",
560
+ " num_heads,\n",
561
+ " num_layers,\n",
562
+ " attn,\n",
563
+ " norm_channels,\n",
564
+ " ):\n",
565
+ " super().__init__()\n",
566
+ " self.num_layers = num_layers\n",
567
+ " self.up_sample = up_sample\n",
568
+ " self.t_emb_dim = t_emb_dim\n",
569
+ " self.attn = attn\n",
570
+ " self.resnet_conv_first = nn.ModuleList(\n",
571
+ " [\n",
572
+ " nn.Sequential(\n",
573
+ " nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),\n",
574
+ " nn.SiLU(),\n",
575
+ " nn.Conv2d(\n",
576
+ " in_channels if i == 0 else out_channels,\n",
577
+ " out_channels,\n",
578
+ " kernel_size=3,\n",
579
+ " stride=1,\n",
580
+ " padding=1,\n",
581
+ " ),\n",
582
+ " )\n",
583
+ " for i in range(num_layers)\n",
584
+ " ]\n",
585
+ " )\n",
586
+ "\n",
587
+ " if self.t_emb_dim is not None:\n",
588
+ " self.t_emb_layers = nn.ModuleList(\n",
589
+ " [\n",
590
+ " nn.Sequential(nn.SiLU(), nn.Linear(t_emb_dim, out_channels))\n",
591
+ " for _ in range(num_layers)\n",
592
+ " ]\n",
593
+ " )\n",
594
+ " self.resnet_conv_second = nn.ModuleList(\n",
595
+ " [\n",
596
+ " nn.Sequential(\n",
597
+ " nn.GroupNorm(norm_channels, out_channels),\n",
598
+ " nn.SiLU(),\n",
599
+ " nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),\n",
600
+ " )\n",
601
+ " for _ in range(num_layers)\n",
602
+ " ]\n",
603
+ " )\n",
604
+ " if self.attn:\n",
605
+ " self.attention_norms = nn.ModuleList(\n",
606
+ " [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]\n",
607
+ " )\n",
608
+ "\n",
609
+ " self.attentions = nn.ModuleList(\n",
610
+ " [\n",
611
+ " nn.MultiheadAttention(out_channels, num_heads, batch_first=True)\n",
612
+ " for _ in range(num_layers)\n",
613
+ " ]\n",
614
+ " )\n",
615
+ " self.residual_input_conv = nn.ModuleList(\n",
616
+ " [\n",
617
+ " nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)\n",
618
+ " for i in range(num_layers)\n",
619
+ " ]\n",
620
+ " )\n",
621
+ " self.up_sample_conv = (\n",
622
+ " nn.ConvTranspose2d(in_channels, in_channels, 4, 2, 1)\n",
623
+ " if self.up_sample\n",
624
+ " else nn.Identity()\n",
625
+ " )\n",
626
+ "\n",
627
+ " def forward(self, x, out_down=None, t_emb=None):\n",
628
+ " # Upsample\n",
629
+ "\n",
630
+ " x = self.up_sample_conv(x)\n",
631
+ "\n",
632
+ " # Concat with Downblock output\n",
633
+ "\n",
634
+ " if out_down is not None:\n",
635
+ " x = torch.cat([x, out_down], dim=1)\n",
636
+ " out = x\n",
637
+ " for i in range(self.num_layers):\n",
638
+ " # Resnet Block\n",
639
+ "\n",
640
+ " resnet_input = out\n",
641
+ " out = self.resnet_conv_first[i](out)\n",
642
+ " if self.t_emb_dim is not None:\n",
643
+ " out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]\n",
644
+ " out = self.resnet_conv_second[i](out)\n",
645
+ " out = out + self.residual_input_conv[i](resnet_input)\n",
646
+ "\n",
647
+ " # Self Attention\n",
648
+ "\n",
649
+ " if self.attn:\n",
650
+ " batch_size, channels, h, w = out.shape\n",
651
+ " in_attn = out.reshape(batch_size, channels, h * w)\n",
652
+ " in_attn = self.attention_norms[i](in_attn)\n",
653
+ " in_attn = in_attn.transpose(1, 2)\n",
654
+ " out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)\n",
655
+ " out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)\n",
656
+ " out = out + out_attn\n",
657
+ " return out\n",
658
+ "\n",
659
+ "\n",
660
+ "class VQVAE(nn.Module):\n",
661
+ " def __init__(self, im_channels, model_config):\n",
662
+ " super().__init__()\n",
663
+ " self.down_channels = model_config.down_channels\n",
664
+ " self.mid_channels = model_config.mid_channels\n",
665
+ " self.down_sample = model_config.down_sample\n",
666
+ " self.num_down_layers = model_config.num_down_layers\n",
667
+ " self.num_mid_layers = model_config.num_mid_layers\n",
668
+ " self.num_up_layers = model_config.num_up_layers\n",
669
+ "\n",
670
+ " # To disable attention in Downblock of Encoder and Upblock of Decoder\n",
671
+ " self.attns = model_config.attn_down\n",
672
+ "\n",
673
+ " # Latent Dimension\n",
674
+ " self.z_channels = model_config.z_channels\n",
675
+ " self.codebook_size = model_config.codebook_size\n",
676
+ " self.norm_channels = model_config.norm_channels\n",
677
+ " self.num_heads = model_config.num_heads\n",
678
+ "\n",
679
+ " # Assertion to validate the channel information\n",
680
+ " assert self.mid_channels[0] == self.down_channels[-1]\n",
681
+ " assert self.mid_channels[-1] == self.down_channels[-1]\n",
682
+ " assert len(self.down_sample) == len(self.down_channels) - 1\n",
683
+ " assert len(self.attns) == len(self.down_channels) - 1\n",
684
+ "\n",
685
+ " # Wherever we use downsampling in encoder correspondingly use\n",
686
+ " # upsampling in decoder\n",
687
+ " self.up_sample = list(reversed(self.down_sample))\n",
688
+ "\n",
689
+ " ##################### Encoder ######################\n",
690
+ " self.encoder_conv_in = nn.Conv2d(\n",
691
+ " im_channels, self.down_channels[0], kernel_size=3, padding=(1, 1)\n",
692
+ " )\n",
693
+ "\n",
694
+ " # Downblock + Midblock\n",
695
+ " self.encoder_layers = nn.ModuleList([])\n",
696
+ " for i in range(len(self.down_channels) - 1):\n",
697
+ " self.encoder_layers.append(\n",
698
+ " DownBlock(\n",
699
+ " self.down_channels[i],\n",
700
+ " self.down_channels[i + 1],\n",
701
+ " t_emb_dim=None,\n",
702
+ " down_sample=self.down_sample[i],\n",
703
+ " num_heads=self.num_heads,\n",
704
+ " num_layers=self.num_down_layers,\n",
705
+ " attn=self.attns[i],\n",
706
+ " norm_channels=self.norm_channels,\n",
707
+ " )\n",
708
+ " )\n",
709
+ " self.encoder_mids = nn.ModuleList([])\n",
710
+ " for i in range(len(self.mid_channels) - 1):\n",
711
+ " self.encoder_mids.append(\n",
712
+ " MidBlock(\n",
713
+ " self.mid_channels[i],\n",
714
+ " self.mid_channels[i + 1],\n",
715
+ " t_emb_dim=None,\n",
716
+ " num_heads=self.num_heads,\n",
717
+ " num_layers=self.num_mid_layers,\n",
718
+ " norm_channels=self.norm_channels,\n",
719
+ " )\n",
720
+ " )\n",
721
+ " self.encoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[-1])\n",
722
+ " self.encoder_conv_out = nn.Conv2d(\n",
723
+ " self.down_channels[-1], self.z_channels, kernel_size=3, padding=1\n",
724
+ " )\n",
725
+ "\n",
726
+ " # Pre Quantization Convolution\n",
727
+ " self.pre_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)\n",
728
+ "\n",
729
+ " # Codebook\n",
730
+ " self.embedding = nn.Embedding(self.codebook_size, self.z_channels)\n",
731
+ " ####################################################\n",
732
+ "\n",
733
+ " ##################### Decoder ######################\n",
734
+ "\n",
735
+ " # Post Quantization Convolution\n",
736
+ " self.post_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)\n",
737
+ " self.decoder_conv_in = nn.Conv2d(\n",
738
+ " self.z_channels, self.mid_channels[-1], kernel_size=3, padding=(1, 1)\n",
739
+ " )\n",
740
+ "\n",
741
+ " # Midblock + Upblock\n",
742
+ " self.decoder_mids = nn.ModuleList([])\n",
743
+ " for i in reversed(range(1, len(self.mid_channels))):\n",
744
+ " self.decoder_mids.append(\n",
745
+ " MidBlock(\n",
746
+ " self.mid_channels[i],\n",
747
+ " self.mid_channels[i - 1],\n",
748
+ " t_emb_dim=None,\n",
749
+ " num_heads=self.num_heads,\n",
750
+ " num_layers=self.num_mid_layers,\n",
751
+ " norm_channels=self.norm_channels,\n",
752
+ " )\n",
753
+ " )\n",
754
+ " self.decoder_layers = nn.ModuleList([])\n",
755
+ " for i in reversed(range(1, len(self.down_channels))):\n",
756
+ " self.decoder_layers.append(\n",
757
+ " UpBlock(\n",
758
+ " self.down_channels[i],\n",
759
+ " self.down_channels[i - 1],\n",
760
+ " t_emb_dim=None,\n",
761
+ " up_sample=self.down_sample[i - 1],\n",
762
+ " num_heads=self.num_heads,\n",
763
+ " num_layers=self.num_up_layers,\n",
764
+ " attn=self.attns[i - 1],\n",
765
+ " norm_channels=self.norm_channels,\n",
766
+ " )\n",
767
+ " )\n",
768
+ " self.decoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[0])\n",
769
+ " self.decoder_conv_out = nn.Conv2d(\n",
770
+ " self.down_channels[0], im_channels, kernel_size=3, padding=1\n",
771
+ " )\n",
772
+ "\n",
773
+ " def quantize(self, x):\n",
774
+ " B, C, H, W = x.shape\n",
775
+ "\n",
776
+ " # B, C, H, W -> B, H, W, C\n",
777
+ " x = x.permute(0, 2, 3, 1)\n",
778
+ "\n",
779
+ " # B, H, W, C -> B, H*W, C\n",
780
+ " x = x.reshape(x.size(0), -1, x.size(-1))\n",
781
+ "\n",
782
+ " # Find nearest embedding/codebook vector\n",
783
+ " # dist between (B, H*W, C) and (B, K, C) -> (B, H*W, K)\n",
784
+ " dist = torch.cdist(x, self.embedding.weight[None, :].repeat((x.size(0), 1, 1)))\n",
785
+ " # (B, H*W)\n",
786
+ " min_encoding_indices = torch.argmin(dist, dim=-1)\n",
787
+ "\n",
788
+ " # Replace encoder output with nearest codebook\n",
789
+ " # quant_out -> B*H*W, C\n",
790
+ " quant_out = torch.index_select(self.embedding.weight, 0, min_encoding_indices.view(-1))\n",
791
+ "\n",
792
+ " # x -> B*H*W, C\n",
793
+ " x = x.reshape((-1, x.size(-1)))\n",
794
+ " commmitment_loss = torch.mean((quant_out.detach() - x) ** 2)\n",
795
+ " codebook_loss = torch.mean((quant_out - x.detach()) ** 2)\n",
796
+ " quantize_losses = {\"codebook_loss\": codebook_loss, \"commitment_loss\": commmitment_loss}\n",
797
+ " # Straight through estimation\n",
798
+ " quant_out = x + (quant_out - x).detach()\n",
799
+ "\n",
800
+ " # quant_out -> B, C, H, W\n",
801
+ " quant_out = quant_out.reshape((B, H, W, C)).permute(0, 3, 1, 2)\n",
802
+ " min_encoding_indices = min_encoding_indices.reshape(\n",
803
+ " (-1, quant_out.size(-2), quant_out.size(-1))\n",
804
+ " )\n",
805
+ " return quant_out, quantize_losses, min_encoding_indices\n",
806
+ "\n",
807
+ " def encode(self, x):\n",
808
+ " out = self.encoder_conv_in(x)\n",
809
+ " for idx, down in enumerate(self.encoder_layers):\n",
810
+ " out = down(out)\n",
811
+ " for mid in self.encoder_mids:\n",
812
+ " out = mid(out)\n",
813
+ " out = self.encoder_norm_out(out)\n",
814
+ " out = nn.SiLU()(out)\n",
815
+ " out = self.encoder_conv_out(out)\n",
816
+ " out = self.pre_quant_conv(out)\n",
817
+ " out, quant_losses, _ = self.quantize(out)\n",
818
+ " return out, quant_losses\n",
819
+ "\n",
820
+ " def decode(self, z):\n",
821
+ " out = z\n",
822
+ " out = self.post_quant_conv(out)\n",
823
+ " out = self.decoder_conv_in(out)\n",
824
+ " for mid in self.decoder_mids:\n",
825
+ " out = mid(out)\n",
826
+ " for idx, up in enumerate(self.decoder_layers):\n",
827
+ " out = up(out)\n",
828
+ " out = self.decoder_norm_out(out)\n",
829
+ " out = nn.SiLU()(out)\n",
830
+ " out = self.decoder_conv_out(out)\n",
831
+ " return out\n",
832
+ "\n",
833
+ " def forward(self, x):\n",
834
+ " z, quant_losses = self.encode(x)\n",
835
+ " out = self.decode(z)\n",
836
+ " return out, z, quant_losses"
837
+ ]
838
+ },
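The quantize() method above combines a nearest-codebook lookup with the straight-through estimator. A toy sketch of the same steps on a small tensor (illustration only; the sizes here are made up):

```python
import torch
import torch.nn as nn

codebook = nn.Embedding(8, 4)                  # 8 codes of dimension 4
z_e = torch.randn(16, 4, requires_grad=True)   # 16 encoder output vectors

dist = torch.cdist(z_e, codebook.weight)       # (16, 8) pairwise distances
idx = dist.argmin(dim=-1)                      # index of nearest code per vector
z_q = codebook.weight[idx]                     # quantized vectors, shape (16, 4)

commitment_loss = torch.mean((z_q.detach() - z_e) ** 2)  # pulls encoder toward codes
codebook_loss = torch.mean((z_q - z_e.detach()) ** 2)    # pulls codes toward encoder

# Straight-through estimator: forward pass uses z_q, gradients flow to z_e as-is.
z_q_st = z_e + (z_q - z_e).detach()
```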
839
+ {
840
+ "cell_type": "markdown",
841
+ "metadata": {},
842
+ "source": [
843
+ "### Configuration"
844
+ ]
845
+ },
846
+ {
847
+ "cell_type": "code",
848
+ "execution_count": 12,
849
+ "metadata": {},
850
+ "outputs": [],
851
+ "source": [
852
+ "config_path = \"/home/23m1521/ashish/MTP/LDM/scripts/config.yaml\"\n",
853
+ "with open(config_path, 'r') as file:\n",
854
+ " Config = yaml.safe_load(file)\n",
855
+ "\n",
856
+ "Config = DotDict.from_dict(Config)\n",
857
+ "dataset_config = Config.dataset_params\n",
858
+ "diffusion_config = Config.diffusion_params\n",
859
+ "model_config = Config.model_params\n",
860
+ "train_config = Config.train_params"
861
+ ]
862
+ },
863
+ {
864
+ "cell_type": "markdown",
865
+ "metadata": {},
866
+ "source": [
867
+ "### MNIST Dataset"
868
+ ]
869
+ },
870
+ {
871
+ "cell_type": "code",
872
+ "execution_count": 13,
873
+ "metadata": {},
874
+ "outputs": [
875
+ {
876
+ "name": "stdout",
877
+ "output_type": "stream",
878
+ "text": [
879
+ "Files found: 70000\n"
880
+ ]
881
+ }
882
+ ],
883
+ "source": [
884
+ "datadir = r\"/home/23m1521/datasets/mnist_images/data\"\n",
885
+ "\n",
886
+ "def walkDIR(folder_path, include=None):\n",
887
+ " file_list = []\n",
888
+ " for root, _, files in os.walk(folder_path):\n",
889
+ " for file in files:\n",
890
+ " if include is None or any(file.endswith(ext) for ext in include):\n",
891
+ " file_list.append(os.path.join(root, file))\n",
892
+ " print(\"Files found:\", len(file_list))\n",
893
+ " return file_list\n",
894
+ "\n",
895
+ "files = walkDIR(datadir, include=['.png', '.jpeg', '.jpg'])\n",
896
+ "df = pd.DataFrame(files, columns=['image_path'])\n",
897
+ "df['id'] = df['image_path'].apply(lambda x: os.path.basename(x))\n",
898
+ "df['label'] = df['image_path'].apply(lambda x: os.path.dirname(x).split(\"/\")[-1])\n",
899
+ "df = df.sample(frac=1, random_state=42).reset_index(drop=True)\n",
900
+ "\n",
901
+ "\n",
902
+ "class MnistDataset(torch.utils.data.Dataset):\n",
903
+ " def __init__(\n",
904
+ " self,\n",
905
+ " data,\n",
906
+ " im_size\n",
907
+ " ):\n",
908
+ " if isinstance(data, str):\n",
909
+ " self.data = pd.read_csv(data)\n",
910
+ " elif isinstance(data, pd.DataFrame):\n",
911
+ " self.data = data\n",
912
+ " else:\n",
913
+ " raise ValueError(\"The `data` argument must be a string (CSV file path) or a Pandas DataFrame.\")\n",
914
+ " \n",
915
+ " self.im_size = im_size\n",
916
+ "\n",
917
+ " def __len__(self):\n",
918
+ " return len(self.data)\n",
919
+ "\n",
920
+ " def __getitem__(self, idx):\n",
921
+ " row = self.data.iloc[idx]\n",
922
+ " image_path = row['image_path']\n",
923
+ " label = int(row['label'])\n",
924
+ "\n",
925
+ " image = tv.io.decode_image(image_path, mode='RGB')\n",
926
+ " image = v2.Resize(self.im_size)(image)\n",
927
+ " image = v2.ToDtype(torch.float32, scale=True)(image)\n",
928
+ " image = 2*image - 1\n",
929
+ "\n",
930
+ " return image, label\n",
931
+ "\n",
932
+ "\n",
933
+ "dataset = MnistDataset(df, im_size=dataset_config.im_size)\n",
934
+ "dataloader = torch.utils.data.DataLoader(\n",
935
+ " dataset, \n",
936
+ " batch_size=train_config.autoencoder_batch_size, \n",
937
+ " shuffle=True, \n",
938
+ " num_workers=os.cpu_count(),\n",
939
+ " pin_memory=True,\n",
940
+ " drop_last=True,\n",
941
+ " persistent_workers=True\n",
942
+ ")\n",
943
+ "\n",
944
+ "# for batch in tqdm(dataloader):\n",
945
+ "# images, labels = batch\n",
946
+ "\n",
947
+ "images, labels = next(iter(dataloader))\n",
948
+ "images, labels = images.to(device), labels.to(device)"
949
+ ]
950
+ },
951
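The dataset maps pixels from [0, 1] to [-1, 1] via 2*image - 1, and the training loop later undoes this with (x + 1) / 2 before saving sample grids. A tiny round-trip check (toy tensor, illustration only):

import torch

img01 = torch.rand(3, 28, 28)      # in [0, 1], as produced by ToDtype(scale=True)
img_pm1 = 2 * img01 - 1            # in [-1, 1], what the VQVAE sees
img_back = (img_pm1 + 1) / 2       # back to [0, 1] for visualisation
assert torch.allclose(img01, img_back)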
+ {
952
+ "cell_type": "code",
953
+ "execution_count": 14,
954
+ "metadata": {},
955
+ "outputs": [
956
+ {
957
+ "data": {
958
+ "text/plain": [
959
+ "(torch.Size([32, 3, 28, 28]),\n",
960
+ " torch.Size([32, 3, 7, 7]),\n",
961
+ " {'codebook_loss': tensor(0.1057, device='cuda:0', grad_fn=<MeanBackward0>),\n",
962
+ " 'commitment_loss': tensor(0.1057, device='cuda:0', grad_fn=<MeanBackward0>)})"
963
+ ]
964
+ },
965
+ "execution_count": 14,
966
+ "metadata": {},
967
+ "output_type": "execute_result"
968
+ }
969
+ ],
970
+ "source": [
971
+ "dataset_config = Config.dataset_params\n",
972
+ "autoencoder_config = Config.autoencoder_params\n",
973
+ "train_config = Config.train_params\n",
974
+ "\n",
975
+ "model = VQVAE(im_channels=dataset_config.im_channels, model_config=autoencoder_config).to(device)\n",
976
+ "\n",
977
+ "model_output = model(images)\n",
978
+ "model_output[0].shape, model_output[1].shape, model_output[2]"
979
+ ]
980
+ },
981
+ {
982
+ "cell_type": "markdown",
983
+ "metadata": {},
984
+ "source": [
985
+ "### VQVAE Training"
986
+ ]
987
+ },
988
+ {
989
+ "cell_type": "code",
990
+ "execution_count": 26,
991
+ "metadata": {},
992
+ "outputs": [],
993
+ "source": [
994
+ "def save_checkpoint(\n",
995
+ " total_steps, epoch, model, discriminator, optimizer_d, optimizer_g, metrics, checkpoint_path\n",
996
+ "):\n",
997
+ " checkpoint = {\n",
998
+ " \"total_steps\": total_steps,\n",
999
+ " \"epoch\": epoch,\n",
1000
+ " \"model_state_dict\": model.state_dict(),\n",
1001
+ " \"discriminator_state_dict\": discriminator.state_dict(),\n",
1002
+ " \"optimizer_d_state_dict\": optimizer_d.state_dict(),\n",
1003
+ " \"optimizer_g_state_dict\": optimizer_g.state_dict(),\n",
1004
+ " \"metrics\": metrics, # Save all metrics\n",
1005
+ " }\n",
1006
+ " torch.save(checkpoint, checkpoint_path)\n",
1007
+ " print(f\"Checkpoint saved after {total_steps} steps at epoch {epoch}\")\n",
1008
+ "\n",
1009
+ "\n",
1010
+ "def load_checkpoint(checkpoint_path, model, discriminator, optimizer_d, optimizer_g):\n",
1011
+ " if os.path.exists(checkpoint_path):\n",
1012
+ " checkpoint = torch.load(checkpoint_path, map_location=device)\n",
1013
+ " model.load_state_dict(checkpoint[\"model_state_dict\"])\n",
1014
+ " discriminator.load_state_dict(checkpoint[\"discriminator_state_dict\"])\n",
1015
+ " optimizer_d.load_state_dict(checkpoint[\"optimizer_d_state_dict\"])\n",
1016
+ " optimizer_g.load_state_dict(checkpoint[\"optimizer_g_state_dict\"])\n",
1017
+ " total_steps = checkpoint[\"total_steps\"]\n",
1018
+ " epoch = checkpoint[\"epoch\"]\n",
1019
+ " metrics = checkpoint[\"metrics\"]\n",
1020
+ " print(f\"Checkpoint loaded. Resuming from epoch {epoch + 1}, step {total_steps}\")\n",
1021
+ " return total_steps, epoch + 1, metrics\n",
1022
+ " else:\n",
1023
+ " print(\"No checkpoint found. Starting from scratch.\")\n",
1024
+ " return 0, 0, None\n",
1025
+ "\n",
1026
+ "\n",
1027
+ "def trainVAE(Config, dataloader):\n",
1028
+ "\n",
1029
+ " # --- Configurations ----------------------------------------------------\n",
1030
+ " dataset_config = Config.dataset_params\n",
1031
+ " autoencoder_config = Config.autoencoder_params\n",
1032
+ " train_config = Config.train_params\n",
1033
+ "\n",
1034
+ " seed = train_config.seed\n",
1035
+ " torch.manual_seed(seed)\n",
1036
+ " np.random.seed(seed)\n",
1037
+ " random.seed(seed)\n",
1038
+ " if device == \"cuda\":\n",
1039
+ " torch.cuda.manual_seed_all(seed)\n",
1040
+ " \n",
1041
+ " \n",
1042
+ " # --- Model Initilization ------------------------------------------------\n",
1043
+ " model = VQVAE(im_channels=dataset_config.im_channels, model_config=autoencoder_config).to(device)\n",
1044
+ " discriminator = Discriminator(im_channels=dataset_config.im_channels).to(device)\n",
1045
+ "\n",
1046
+ " \n",
1047
+ " # --- Optimizer Initilization ----------------------------------------------\n",
1048
+ " optimizer_d = torch.optim.AdamW(discriminator.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))\n",
1049
+ " optimizer_g = torch.optim.AdamW(model.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))\n",
1050
+ " \n",
1051
+ " \n",
1052
+ " # --- Checkpoint Loading ------------------------------------------------\n",
1053
+ " checkpoint_path = os.path.join(train_config.task_name, \"checkpoint.pth\")\n",
1054
+ " total_steps, start_epoch, metrics = load_checkpoint(checkpoint_path, model, discriminator, optimizer_d, optimizer_g)\n",
1055
+ " if os.path.exists(\n",
1056
+ " os.path.join(train_config.task_name, train_config.vqvae_autoencoder_ckpt_name)\n",
1057
+ " ):\n",
1058
+ " print(\"Loaded vae checkpoint\")\n",
1059
+ " model.load_state_dict(\n",
1060
+ " torch.load(\n",
1061
+ " os.path.join(train_config.task_name, train_config.vqvae_autoencoder_ckpt_name),\n",
1062
+ " map_location=device,\n",
1063
+ " weights_only=True,\n",
1064
+ " )\n",
1065
+ " )\n",
1066
+ " \n",
1067
+ " if os.path.exists(\n",
1068
+ " os.path.join(train_config.task_name, train_config.vqvae_discriminator_ckpt_name)\n",
1069
+ " ):\n",
1070
+ " print(\"Loaded discriminator checkpoint\")\n",
1071
+ " discriminator.load_state_dict(\n",
1072
+ " torch.load(\n",
1073
+ " os.path.join(train_config.task_name, train_config.vqvae_discriminator_ckpt_name),\n",
1074
+ " map_location=device,\n",
1075
+ " weights_only=True,\n",
1076
+ " )\n",
1077
+ " )\n",
1078
+ " \n",
1079
+ " \n",
1080
+ " \n",
1081
+ " # --- Loss Function Initilization ----------------------------------------\n",
1082
+ " if not os.path.exists(train_config.task_name):\n",
1083
+ " os.mkdir(train_config.task_name)\n",
1084
+ " num_epochs = train_config.autoencoder_epochs\n",
1085
+ "\n",
1086
+ " # L1/L2 loss for Reconstruction\n",
1087
+ " recon_criterion = torch.nn.MSELoss()\n",
1088
+ " disc_criterion = torch.nn.MSELoss()\n",
1089
+ "\n",
1090
+ " # LPIPS loss for perceptual similarity\n",
1091
+ " lpips_model = LPIPS().eval().to(device)\n",
1092
+ "\n",
1093
+ " \n",
1094
+ " \n",
1095
+ "\n",
1096
+ " disc_step_start = train_config.disc_start\n",
1097
+ " step_count = 0\n",
1098
+ "\n",
1099
+ " # This is for accumulating gradients incase the images are huge\n",
1100
+ " # And one cant afford higher batch sizes\n",
1101
+ "\n",
1102
+ " acc_steps = train_config.autoencoder_acc_steps\n",
1103
+ " image_save_steps = train_config.autoencoder_img_save_steps\n",
1104
+ " img_save_count = 0\n",
1105
+ "\n",
1106
+ " for epoch_idx in trange(num_epochs, desc=\"Training VQVAE\"):\n",
1107
+ " recon_losses = []\n",
1108
+ " codebook_losses = []\n",
1109
+ " # commitment_losses = []\n",
1110
+ "\n",
1111
+ " perceptual_losses = []\n",
1112
+ " disc_losses = []\n",
1113
+ " gen_losses = []\n",
1114
+ " losses = []\n",
1115
+ "\n",
1116
+ " optimizer_g.zero_grad()\n",
1117
+ " optimizer_d.zero_grad()\n",
1118
+ "\n",
1119
+ " # for images in tqdm(dataloader):\n",
1120
+ " for images in dataloader:\n",
1121
+ " step_count += 1\n",
1122
+ " images = images.to(device)\n",
1123
+ "\n",
1124
+ " # Fetch autoencoders output(reconstructions)\n",
1125
+ " model_output = model(images)\n",
1126
+ " output, z, quantize_losses = model_output\n",
1127
+ "\n",
1128
+ " # Image Saving Logic\n",
1129
+ " if step_count % image_save_steps == 0 or step_count == 1:\n",
1130
+ " sample_size = min(8, images.shape[0])\n",
1131
+ " save_output = torch.clamp(output[:sample_size], -1.0, 1.0).detach().cpu()\n",
1132
+ " save_output = (save_output + 1) / 2\n",
1133
+ " save_input = ((images[:sample_size] + 1) / 2).detach().cpu()\n",
1134
+ "\n",
1135
+ " grid = make_grid(torch.cat([save_input, save_output], dim=0), nrow=sample_size)\n",
1136
+ " img = tv.transforms.ToPILImage()(grid)\n",
1137
+ " if not os.path.exists(\n",
1138
+ " os.path.join(train_config.task_name, \"vqvae_autoencoder_samples\")\n",
1139
+ " ):\n",
1140
+ " os.mkdir(os.path.join(train_config.task_name, \"vqvae_autoencoder_samples\"))\n",
1141
+ " img.save(\n",
1142
+ " os.path.join(\n",
1143
+ " train_config.task_name,\n",
1144
+ " \"vqvae_autoencoder_samples\",\n",
1145
+ " \"current_autoencoder_sample_{}.png\".format(img_save_count),\n",
1146
+ " )\n",
1147
+ " )\n",
1148
+ " img_save_count += 1\n",
1149
+ " img.close()\n",
1150
+ " \n",
1151
+ " \n",
1152
+ " ######### Optimize Generator ##########\n",
1153
+ " # L2 Loss for Reconstruction\n",
1154
+ " recon_loss = recon_criterion(output, images)\n",
1155
+ " recon_losses.append(recon_loss.item())\n",
1156
+ " recon_loss = recon_loss / acc_steps\n",
1157
+ " \n",
1158
+ " # Generator Loss =\n",
1159
+ " g_loss = (\n",
1160
+ " recon_loss\n",
1161
+ " + (train_config.codebook_weight * quantize_losses[\"codebook_loss\"] / acc_steps)\n",
1162
+ " + (train_config.commitment_beta * quantize_losses[\"commitment_loss\"] / acc_steps)\n",
1163
+ " )\n",
1164
+ " \n",
1165
+ " codebook_losses.append(\n",
1166
+ " train_config.codebook_weight * quantize_losses[\"codebook_loss\"].item()\n",
1167
+ " )\n",
1168
+ " \n",
1169
+ "\n",
1170
+ " # Adversarial loss only if disc_step_start steps passed\n",
1171
+ " if step_count > disc_step_start:\n",
1172
+ " disc_fake_pred = discriminator(model_output[0])\n",
1173
+ " disc_fake_loss = disc_criterion(\n",
1174
+ " disc_fake_pred,\n",
1175
+ " torch.ones(disc_fake_pred.shape, device=disc_fake_pred.device),\n",
1176
+ " )\n",
1177
+ " gen_losses.append(train_config.disc_weight * disc_fake_loss.item())\n",
1178
+ " g_loss += train_config.disc_weight * disc_fake_loss / acc_steps\n",
1179
+ " lpips_loss = torch.mean(lpips_model(output, images)) / acc_steps\n",
1180
+ " perceptual_losses.append(train_config.perceptual_weight * lpips_loss.item())\n",
1181
+ " g_loss += train_config.perceptual_weight * lpips_loss / acc_steps\n",
1182
+ " losses.append(g_loss.item())\n",
1183
+ " g_loss.backward()\n",
1184
+ " #####################################\n",
1185
+ "\n",
1186
+ "\n",
1187
+ " ######### Optimize Discriminator #######\n",
1188
+ " if step_count > disc_step_start:\n",
1189
+ " fake = output\n",
1190
+ " disc_fake_pred = discriminator(fake.detach())\n",
1191
+ " disc_real_pred = discriminator(images)\n",
1192
+ " disc_fake_loss = disc_criterion(\n",
1193
+ " disc_fake_pred,\n",
1194
+ " torch.zeros(disc_fake_pred.shape, device=disc_fake_pred.device),\n",
1195
+ " )\n",
1196
+ " disc_real_loss = disc_criterion(\n",
1197
+ " disc_real_pred,\n",
1198
+ " torch.ones(disc_real_pred.shape, device=disc_real_pred.device),\n",
1199
+ " )\n",
1200
+ " disc_loss = train_config.disc_weight * (disc_fake_loss + disc_real_loss) / 2\n",
1201
+ " disc_losses.append(disc_loss.item())\n",
1202
+ " disc_loss = disc_loss / acc_steps\n",
1203
+ " disc_loss.backward()\n",
1204
+ " if step_count % acc_steps == 0:\n",
1205
+ " optimizer_d.step()\n",
1206
+ " optimizer_d.zero_grad()\n",
1207
+ " #####################################\n",
1208
+ "\n",
1209
+ " if step_count % acc_steps == 0:\n",
1210
+ " optimizer_g.step()\n",
1211
+ " optimizer_g.zero_grad()\n",
1212
+ " optimizer_d.step()\n",
1213
+ " optimizer_d.zero_grad()\n",
1214
+ " optimizer_g.step()\n",
1215
+ " optimizer_g.zero_grad()\n",
1216
+ " if len(disc_losses) > 0:\n",
1217
+ " print(\n",
1218
+ " \"Finished epoch: {}/{} | Recon Loss : {:.4f} | Perceptual Loss : {:.4f} | \"\n",
1219
+ " \"Codebook : {:.4f} | G Loss : {:.4f} | D Loss {:.4f}\".format(\n",
1220
+ " epoch_idx + 1,\n",
1221
+ " num_epochs,\n",
1222
+ " np.mean(recon_losses),\n",
1223
+ " np.mean(perceptual_losses),\n",
1224
+ " np.mean(codebook_losses),\n",
1225
+ " np.mean(gen_losses),\n",
1226
+ " np.mean(disc_losses),\n",
1227
+ " )\n",
1228
+ " )\n",
1229
+ " else:\n",
1230
+ " print(\n",
1231
+ " \"Finished epoch: {}/{} | Recon Loss : {:.4f} | Perceptual Loss : {:.4f} | Codebook : {:.4f}\".format(\n",
1232
+ " epoch_idx + 1,\n",
1233
+ " num_epochs,\n",
1234
+ " np.mean(recon_losses),\n",
1235
+ " np.mean(perceptual_losses),\n",
1236
+ " np.mean(codebook_losses),\n",
1237
+ " )\n",
1238
+ " )\n",
1239
+ " torch.save(\n",
1240
+ " model.state_dict(),\n",
1241
+ " os.path.join(train_config.task_name, train_config.vqvae_autoencoder_ckpt_name),\n",
1242
+ " )\n",
1243
+ " torch.save(\n",
1244
+ " discriminator.state_dict(),\n",
1245
+ " os.path.join(train_config.task_name, train_config.vqvae_discriminator_ckpt_name),\n",
1246
+ " )\n",
1247
+ " print(\"Done Training...\")"
1248
+ ]
1249
+ },
1250
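The training cell divides every loss term by acc_steps and only steps the optimizers every acc_steps iterations, which emulates a batch size of autoencoder_batch_size * acc_steps when memory is tight. The pattern in isolation, as a runnable toy sketch (the model, loader and sizes are made up for illustration):

import torch
import torch.nn as nn

model = nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loader = [(torch.randn(4, 8),) for _ in range(8)]    # stand-in for a DataLoader
acc_steps = 4

optimizer.zero_grad()
for step, (x,) in enumerate(loader, start=1):
    loss = nn.functional.mse_loss(model(x), x) / acc_steps  # scale so grads sum to a full-batch grad
    loss.backward()                                         # gradients accumulate until the step below
    if step % acc_steps == 0:
        optimizer.step()
        optimizer.zero_grad()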
+ {
1251
+ "cell_type": "code",
1252
+ "execution_count": 27,
1253
+ "metadata": {},
1254
+ "outputs": [],
1255
+ "source": [
1256
+ "# trainVAE(Config)"
1257
+ ]
1258
+ },
1259
+ {
1260
+ "cell_type": "code",
1261
+ "execution_count": null,
1262
+ "metadata": {},
1263
+ "outputs": [],
1264
+ "source": [
1265
+ "def save_checkpoint(\n",
1266
+ " total_steps, epoch, model, discriminator, optimizer_d, optimizer_g, metrics, checkpoint_path\n",
1267
+ "):\n",
1268
+ " checkpoint = {\n",
1269
+ " \"total_steps\": total_steps,\n",
1270
+ " \"epoch\": epoch,\n",
1271
+ " \"model_state_dict\": model.state_dict(),\n",
1272
+ " \"discriminator_state_dict\": discriminator.state_dict(),\n",
1273
+ " \"optimizer_d_state_dict\": optimizer_d.state_dict(),\n",
1274
+ " \"optimizer_g_state_dict\": optimizer_g.state_dict(),\n",
1275
+ " \"metrics\": metrics, # Save all metrics\n",
1276
+ " }\n",
1277
+ " torch.save(checkpoint, checkpoint_path)\n",
1278
+ " print(f\"Checkpoint saved after {total_steps} steps at epoch {epoch}\")\n",
1279
+ "\n",
1280
+ "\n",
1281
+ "def load_checkpoint(checkpoint_path, model, discriminator, optimizer_d, optimizer_g):\n",
1282
+ " if os.path.exists(checkpoint_path):\n",
1283
+ " checkpoint = torch.load(checkpoint_path, map_location=device)\n",
1284
+ " model.load_state_dict(checkpoint[\"model_state_dict\"])\n",
1285
+ " discriminator.load_state_dict(checkpoint[\"discriminator_state_dict\"])\n",
1286
+ " optimizer_d.load_state_dict(checkpoint[\"optimizer_d_state_dict\"])\n",
1287
+ " optimizer_g.load_state_dict(checkpoint[\"optimizer_g_state_dict\"])\n",
1288
+ " total_steps = checkpoint[\"total_steps\"]\n",
1289
+ " epoch = checkpoint[\"epoch\"]\n",
1290
+ " metrics = checkpoint[\"metrics\"]\n",
1291
+ " print(f\"Checkpoint loaded. Resuming from epoch {epoch + 1}, step {total_steps}\")\n",
1292
+ " return total_steps, epoch + 1, metrics\n",
1293
+ " else:\n",
1294
+ " print(\"No checkpoint found. Starting from scratch.\")\n",
1295
+ " return 0, 0, None\n",
1296
+ "\n",
1297
+ "\n",
1298
+ "def trainVAE(Config, dataloader):\n",
1299
+ "\n",
1300
+ " # --- Configurations ----------------------------------------------------\n",
1301
+ " dataset_config = Config.dataset_params\n",
1302
+ " autoencoder_config = Config.autoencoder_params\n",
1303
+ " train_config = Config.train_params\n",
1304
+ "\n",
1305
+ " seed = train_config.seed\n",
1306
+ " torch.manual_seed(seed)\n",
1307
+ " np.random.seed(seed)\n",
1308
+ " random.seed(seed)\n",
1309
+ " if device == \"cuda\":\n",
1310
+ " torch.cuda.manual_seed_all(seed)\n",
1311
+ " \n",
1312
+ " \n",
1313
+ " # --- Model Initilization ------------------------------------------------\n",
1314
+ " model = VQVAE(im_channels=dataset_config.im_channels, model_config=autoencoder_config).to(device)\n",
1315
+ " discriminator = Discriminator(im_channels=dataset_config.im_channels).to(device)\n",
1316
+ "\n",
1317
+ " \n",
1318
+ " # --- Optimizer Initilization ----------------------------------------------\n",
1319
+ " optimizer_d = torch.optim.AdamW(discriminator.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))\n",
1320
+ " optimizer_g = torch.optim.AdamW(model.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))\n",
1321
+ " \n",
1322
+ " \n",
1323
+ " # --- Loss Function Initialization --------------------------------------\n",
1324
+ " recon_criterion = torch.nn.MSELoss()\n",
1325
+ " # disc_criterion = torch.nn.MSELoss()\n",
1326
+ " disc_criterion = torch.nn.BCEWithLogits()\n",
1327
+ " lpips_model = LPIPS().eval().to(device)\n",
1328
+ "\n",
1329
+ " \n",
1330
+ " # --- Training Loop -----------------------------------------------------\n",
1331
+ " step_count = 0\n",
1332
+ " num_epochs = train_config.autoencoder_epochs\n",
1333
+ " disc_step_start = train_config.disc_start\n",
1334
+ " acc_steps = train_config.autoencoder_acc_steps\n",
1335
+ " image_save_steps = train_config.autoencoder_img_save_steps\n",
1336
+ " img_save_count = 0\n",
1337
+ " start_epoch = 0\n",
1338
+ "\n",
1339
+ " for epoch_idx in range(start_epoch, num_epochs):\n",
1340
+ " recon_losses = []\n",
1341
+ " codebook_losses = []\n",
1342
+ " perceptual_losses = []\n",
1343
+ " \n",
1344
+ " disc_losses = []\n",
1345
+ " gen_losses = []\n",
1346
+ " losses = []\n",
1347
+ "\n",
1348
+ " optimizer_g.zero_grad()\n",
1349
+ " optimizer_d.zero_grad()\n",
1350
+ "\n",
1351
+ " for images in dataloader:\n",
1352
+ " step_count += 1\n",
1353
+ " images = images.to(device)\n",
1354
+ "\n",
1355
+ " model_output = model(images)\n",
1356
+ " output, z, quantize_losses = model_output\n",
1357
+ " \n",
1358
+ " \n",
1359
+ " # --- Reconstruction Loss ---------------------------------------------------------\n",
1360
+ " recon_loss = recon_criterion(output, images)\n",
1361
+ " recon_losses.append(recon_loss.item())\n",
1362
+ " recon_loss = recon_loss / acc_steps\n",
1363
+ " \n",
1364
+ " # --- CodeBook Loss ---------------------------------------------------------------\n",
1365
+ " codebook_losses.append(train_config.codebook_weight * quantize_losses[\"codebook_loss\"].item())\n",
1366
+ " \n",
1367
+ " # --- Perceptual Loss -------------------------------------------------------------\n",
1368
+ " lpips_loss = torch.mean(lpips_model(output, images)) / acc_steps\n",
1369
+ " perceptual_losses.append(train_config.perceptual_weight * lpips_loss.item())\n",
1370
+ " \n",
1371
+ " \n",
1372
+ " g_loss = (\n",
1373
+ " recon_loss\n",
1374
+ " + (train_config.codebook_weight * quantize_losses[\"codebook_loss\"] / acc_steps)\n",
1375
+ " + (train_config.commitment_beta * quantize_losses[\"commitment_loss\"] / acc_steps)\n",
1376
+ " )\n",
1377
+ " \n",
1378
+ "\n",
1379
+ " # Adversarial loss only if disc_step_start steps passed\n",
1380
+ " if step_count > disc_step_start:\n",
1381
+ " disc_fake_pred = discriminator(model_output[0])\n",
1382
+ " disc_fake_loss = disc_criterion(\n",
1383
+ " disc_fake_pred,\n",
1384
+ " torch.ones(disc_fake_pred.shape, device=disc_fake_pred.device),\n",
1385
+ " )\n",
1386
+ " gen_losses.append(train_config.disc_weight * disc_fake_loss.item())\n",
1387
+ " g_loss += train_config.disc_weight * disc_fake_loss / acc_steps\n",
1388
+ " \n",
1389
+ " \n",
1390
+ " \n",
1391
+ " g_loss += train_config.perceptual_weight * lpips_loss / acc_steps\n",
1392
+ " losses.append(g_loss.item())\n",
1393
+ " g_loss.backward()\n",
1394
+ "\n",
1395
+ "\n",
1396
+ " ######### Optimize Discriminator #######\n",
1397
+ " if step_count > disc_step_start:\n",
1398
+ " fake = output\n",
1399
+ " disc_fake_pred = discriminator(fake.detach())\n",
1400
+ " disc_real_pred = discriminator(images)\n",
1401
+ " disc_fake_loss = disc_criterion(\n",
1402
+ " disc_fake_pred,\n",
1403
+ " torch.zeros(disc_fake_pred.shape, device=disc_fake_pred.device),\n",
1404
+ " )\n",
1405
+ " disc_real_loss = disc_criterion(\n",
1406
+ " disc_real_pred,\n",
1407
+ " torch.ones(disc_real_pred.shape, device=disc_real_pred.device),\n",
1408
+ " )\n",
1409
+ " disc_loss = train_config.disc_weight * (disc_fake_loss + disc_real_loss) / 2\n",
1410
+ " disc_losses.append(disc_loss.item())\n",
1411
+ " disc_loss = disc_loss / acc_steps\n",
1412
+ " disc_loss.backward()\n",
1413
+ " if step_count % acc_steps == 0:\n",
1414
+ " optimizer_d.step()\n",
1415
+ " optimizer_d.zero_grad()\n",
1416
+ " #####################################\n",
1417
+ "\n",
1418
+ " if step_count % acc_steps == 0:\n",
1419
+ " optimizer_g.step()\n",
1420
+ " optimizer_g.zero_grad()\n",
1421
+ " optimizer_d.step()\n",
1422
+ " optimizer_d.zero_grad()\n",
1423
+ " optimizer_g.step()\n",
1424
+ " optimizer_g.zero_grad()\n",
1425
+ " if len(disc_losses) > 0:\n",
1426
+ " print(\n",
1427
+ " \"Finished epoch: {}/{} | Recon Loss : {:.4f} | Perceptual Loss : {:.4f} | \"\n",
1428
+ " \"Codebook : {:.4f} | G Loss : {:.4f} | D Loss {:.4f}\".format(\n",
1429
+ " epoch_idx + 1,\n",
1430
+ " num_epochs,\n",
1431
+ " np.mean(recon_losses),\n",
1432
+ " np.mean(perceptual_losses),\n",
1433
+ " np.mean(codebook_losses),\n",
1434
+ " np.mean(gen_losses),\n",
1435
+ " np.mean(disc_losses),\n",
1436
+ " )\n",
1437
+ " )\n",
1438
+ " else:\n",
1439
+ " print(\n",
1440
+ " \"Finished epoch: {}/{} | Recon Loss : {:.4f} | Perceptual Loss : {:.4f} | Codebook : {:.4f}\".format(\n",
1441
+ " epoch_idx + 1,\n",
1442
+ " num_epochs,\n",
1443
+ " np.mean(recon_losses),\n",
1444
+ " np.mean(perceptual_losses),\n",
1445
+ " np.mean(codebook_losses),\n",
1446
+ " )\n",
1447
+ " )\n",
1448
+ " torch.save(\n",
1449
+ " model.state_dict(),\n",
1450
+ " os.path.join(train_config.task_name, train_config.vqvae_autoencoder_ckpt_name),\n",
1451
+ " )\n",
1452
+ " torch.save(\n",
1453
+ " discriminator.state_dict(),\n",
1454
+ " os.path.join(train_config.task_name, train_config.vqvae_discriminator_ckpt_name),\n",
1455
+ " )\n",
1456
+ " print(\"Done Training...\")"
1457
+ ]
1458
+ }
1459
+ ],
1460
+ "metadata": {
1461
+ "kernelspec": {
1462
+ "display_name": "Python 3",
1463
+ "language": "python",
1464
+ "name": "python3"
1465
+ },
1466
+ "language_info": {
1467
+ "codemirror_mode": {
1468
+ "name": "ipython",
1469
+ "version": 3
1470
+ },
1471
+ "file_extension": ".py",
1472
+ "mimetype": "text/x-python",
1473
+ "name": "python",
1474
+ "nbconvert_exporter": "python",
1475
+ "pygments_lexer": "ipython3",
1476
+ "version": "3.12.5"
1477
+ }
1478
+ },
1479
+ "nbformat": 4,
1480
+ "nbformat_minor": 2
1481
+ }
LDM/notebooks/_2_Rough-LPIPS.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
LDM/scripts/Main.py ADDED
@@ -0,0 +1,2273 @@
1
+ # ==================================================================
2
+ # L A T E N T D I F F U S I O N M O D E L
3
+ # ==================================================================
4
+ # Author : Ashish Kumar Uchadiya
5
+ # Created : November 3, 2024
6
+ # Description: This script implements a Latent Diffusion Model using
7
+ # a cosine or linear noise scheduling approach for high-resolution
8
+ # image generation. The model leverages generative techniques to
9
+ # learn a latent representation and progressively reduce noise to
10
+ # generate clear, realistic images.
11
+ # ==================================================================
12
+ # I M P O R T S
13
+ # ==================================================================
14
+
15
+ import os
16
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
17
+
18
+ """Lpips"""
19
+
20
+ # from __future__ import absolute_import
21
+ from collections import namedtuple
22
+ import torch
23
+ import torch.nn as nn
24
+ import torch.nn.init as init
25
+ from torch.autograd import Variable
26
+ import numpy as np
27
+ import torch.nn
28
+ import torchvision
29
+
30
+ # Taken from https://github.com/richzhang/PerceptualSimilarity/blob/master/lpips/lpips.py
31
+
32
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
33
+
34
+
35
+ def spatial_average(in_tens, keepdim=True):
36
+ return in_tens.mean([2, 3], keepdim=keepdim)
37
+
38
+
39
+ class vgg16(torch.nn.Module):
40
+ def __init__(self, requires_grad=False, pretrained=True):
41
+ super(vgg16, self).__init__()
42
+ vgg_pretrained_features = torchvision.models.vgg16(
43
+ weights=torchvision.models.VGG16_Weights.IMAGENET1K_V1
44
+ ).features
45
+ self.slice1 = torch.nn.Sequential()
46
+ self.slice2 = torch.nn.Sequential()
47
+ self.slice3 = torch.nn.Sequential()
48
+ self.slice4 = torch.nn.Sequential()
49
+ self.slice5 = torch.nn.Sequential()
50
+ self.N_slices = 5
51
+ for x in range(4):
52
+ self.slice1.add_module(str(x), vgg_pretrained_features[x])
53
+ for x in range(4, 9):
54
+ self.slice2.add_module(str(x), vgg_pretrained_features[x])
55
+ for x in range(9, 16):
56
+ self.slice3.add_module(str(x), vgg_pretrained_features[x])
57
+ for x in range(16, 23):
58
+ self.slice4.add_module(str(x), vgg_pretrained_features[x])
59
+ for x in range(23, 30):
60
+ self.slice5.add_module(str(x), vgg_pretrained_features[x])
61
+
62
+ # Freeze vgg model
63
+ if not requires_grad:
64
+ for param in self.parameters():
65
+ param.requires_grad = False
66
+
67
+ def forward(self, X):
68
+ # Return output of vgg features
69
+ h = self.slice1(X)
70
+ h_relu1_2 = h
71
+ h = self.slice2(h)
72
+ h_relu2_2 = h
73
+ h = self.slice3(h)
74
+ h_relu3_3 = h
75
+ h = self.slice4(h)
76
+ h_relu4_3 = h
77
+ h = self.slice5(h)
78
+ h_relu5_3 = h
79
+ vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
80
+ out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
81
+ return out
82
+
83
+
84
+ # Learned perceptual metric
85
+ class LPIPS(nn.Module):
86
+ def __init__(self, net='vgg', version='0.1', use_dropout=True):
87
+ super(LPIPS, self).__init__()
88
+ self.version = version
89
+ # Imagenet normalization
90
+ self.scaling_layer = ScalingLayer()
91
+ ########################
92
+
93
+ # Instantiate vgg model
94
+ self.chns = [64, 128, 256, 512, 512]
95
+ self.L = len(self.chns)
96
+ self.net = vgg16(pretrained=True, requires_grad=False)
97
+
98
+ # Add 1x1 convolutional Layers
99
+ self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
100
+ self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
101
+ self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
102
+ self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
103
+ self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
104
+ self.lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
105
+ self.lins = nn.ModuleList(self.lins)
106
+ ########################
107
+
108
+ # Load the weights of trained LPIPS model
109
+ import inspect
110
+ import os
111
+ # /home/taruntejaneurips23/.cache/torch/hub/checkpoints/vgg16-397923af.pth
112
+ print(os.path.abspath(os.path.join(inspect.getfile(self.__init__), '..', 'weights/v%s/%s.pth' % (version, net))))
113
+ # model_path = os.path.abspath(
114
+ # os.path.join(inspect.getfile(self.__init__), '..', 'weights/v%s/%s.pth' % (version, net)))
115
+
116
+ # print('Loading model from: %s' % model_path)
117
+ # self.load_state_dict(torch.load(model_path, map_location=device), strict=False)
118
+ ########################
119
+
120
+ # Freeze all parameters
121
+ self.eval()
122
+ for param in self.parameters():
123
+ param.requires_grad = False
124
+ ########################
125
+
126
+ def forward(self, in0, in1, normalize=False):
127
+ # Scale the inputs to -1 to +1 range if needed
128
+ if normalize: # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
129
+ in0 = 2 * in0 - 1
130
+ in1 = 2 * in1 - 1
131
+ ########################
132
+
133
+ # Normalize the inputs according to imagenet normalization
134
+ in0_input, in1_input = self.scaling_layer(in0), self.scaling_layer(in1)
135
+ ########################
136
+
137
+ # Get VGG outputs for image0 and image1
138
+ outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
139
+ feats0, feats1, diffs = {}, {}, {}
140
+ ########################
141
+
142
+ # Compute Square of Difference for each layer output
143
+ for kk in range(self.L):
144
+             feats0[kk], feats1[kk] = torch.nn.functional.normalize(outs0[kk], dim=1), torch.nn.functional.normalize(
145
+                 outs1[kk], dim=1)
146
+ diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
147
+ ########################
148
+
149
+ # 1x1 convolution followed by spatial average on the square differences
150
+ res = [spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)]
151
+ val = 0
152
+
153
+ # Aggregate the results of each layer
154
+ for l in range(self.L):
155
+ val += res[l]
156
+ return val
157
+
158
+
159
+ class ScalingLayer(nn.Module):
160
+ def __init__(self):
161
+ super(ScalingLayer, self).__init__()
162
+         # Imagenet normalization for (0-1)
163
+ # mean = [0.485, 0.456, 0.406]
164
+ # std = [0.229, 0.224, 0.225]
165
+ self.register_buffer('shift', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
166
+ self.register_buffer('scale', torch.Tensor([.458, .448, .450])[None, :, None, None])
167
+
168
+ def forward(self, inp):
169
+ return (inp - self.shift) / self.scale
170
+
171
+
172
+ class NetLinLayer(nn.Module):
173
+ ''' A single linear layer which does a 1x1 conv '''
174
+
175
+ def __init__(self, chn_in, chn_out=1, use_dropout=False):
176
+ super(NetLinLayer, self).__init__()
177
+
178
+ layers = [nn.Dropout(), ] if (use_dropout) else []
179
+ layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ]
180
+ self.model = nn.Sequential(*layers)
181
+
182
+ def forward(self, x):
183
+ out = self.model(x)
184
+ return out
185
+
186
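A quick sanity check of the LPIPS module defined above: it expects inputs roughly in [-1, 1] (or pass normalize=True for [0, 1] inputs) and returns one perceptual distance per image. The shapes below are assumptions for illustration; run it after the definitions above.

lpips_model = LPIPS().eval().to(device)
a = torch.rand(2, 3, 64, 64, device=device) * 2 - 1   # toy batch in [-1, 1]
b = torch.rand(2, 3, 64, 64, device=device) * 2 - 1
with torch.no_grad():
    d = lpips_model(a, b)        # shape (2, 1, 1, 1): per-image perceptual distance
print(d.flatten())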
+ """Blocks"""
187
+
188
+ import torch
189
+ import numpy as np
190
+
191
+
192
+ class LinearNoiseScheduler:
193
+ r"""
194
+ Class for the linear noise scheduler that is used in DDPM.
195
+ """
196
+
197
+ def __init__(self, num_timesteps, beta_start, beta_end):
198
+
199
+ self.num_timesteps = num_timesteps
200
+ self.beta_start = beta_start
201
+ self.beta_end = beta_end
202
+ # Mimicking how compvis repo creates schedule
203
+ self.betas = (
204
+ torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_timesteps) ** 2
205
+ )
206
+ self.alphas = 1. - self.betas
207
+ self.alpha_cum_prod = torch.cumprod(self.alphas, dim=0)
208
+ self.sqrt_alpha_cum_prod = torch.sqrt(self.alpha_cum_prod)
209
+ self.sqrt_one_minus_alpha_cum_prod = torch.sqrt(1 - self.alpha_cum_prod)
210
+
211
+ def add_noise(self, original, noise, t):
212
+ r"""
213
+ Forward method for diffusion
214
+ :param original: Image on which noise is to be applied
215
+ :param noise: Random Noise Tensor (from normal dist)
216
+ :param t: timestep of the forward process of shape -> (B,)
217
+ :return:
218
+ """
219
+ original_shape = original.shape
220
+ batch_size = original_shape[0]
221
+
222
+ sqrt_alpha_cum_prod = self.sqrt_alpha_cum_prod.to(original.device)[t].reshape(batch_size)
223
+ sqrt_one_minus_alpha_cum_prod = self.sqrt_one_minus_alpha_cum_prod.to(original.device)[t].reshape(batch_size)
224
+
225
+ # Reshape till (B,) becomes (B,1,1,1) if image is (B,C,H,W)
226
+ for _ in range(len(original_shape) - 1):
227
+ sqrt_alpha_cum_prod = sqrt_alpha_cum_prod.unsqueeze(-1)
228
+ for _ in range(len(original_shape) - 1):
229
+ sqrt_one_minus_alpha_cum_prod = sqrt_one_minus_alpha_cum_prod.unsqueeze(-1)
230
+
231
+ # Apply and Return Forward process equation
232
+ return (sqrt_alpha_cum_prod.to(original.device) * original
233
+ + sqrt_one_minus_alpha_cum_prod.to(original.device) * noise)
234
+
235
+ def sample_prev_timestep(self, xt, noise_pred, t):
236
+ r"""
237
+ Use the noise prediction by model to get
238
+         xt-1 using xt and the noise predicted
239
+ :param xt: current timestep sample
240
+ :param noise_pred: model noise prediction
241
+ :param t: current timestep we are at
242
+ :return:
243
+ """
244
+ x0 = ((xt - (self.sqrt_one_minus_alpha_cum_prod.to(xt.device)[t] * noise_pred)) /
245
+ torch.sqrt(self.alpha_cum_prod.to(xt.device)[t]))
246
+ x0 = torch.clamp(x0, -1., 1.)
247
+
248
+ mean = xt - ((self.betas.to(xt.device)[t]) * noise_pred) / (self.sqrt_one_minus_alpha_cum_prod.to(xt.device)[t])
249
+ mean = mean / torch.sqrt(self.alphas.to(xt.device)[t])
250
+
251
+ if t == 0:
252
+ return mean, x0
253
+ else:
254
+ variance = (1 - self.alpha_cum_prod.to(xt.device)[t - 1]) / (1.0 - self.alpha_cum_prod.to(xt.device)[t])
255
+ variance = variance * self.betas.to(xt.device)[t]
256
+ sigma = variance ** 0.5
257
+ z = torch.randn(xt.shape).to(xt.device)
258
+
259
+ # OR
260
+ # variance = self.betas[t]
261
+ # sigma = variance ** 0.5
262
+ # z = torch.randn(xt.shape).to(xt.device)
263
+ return mean + sigma * z, x0
264
+
265
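add_noise above is the closed-form forward process x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps. A short sketch exercising it with assumed hyper-parameters (1000 steps and the commonly used 1e-4 to 0.02 beta range):

scheduler = LinearNoiseScheduler(num_timesteps=1000, beta_start=1e-4, beta_end=0.02)
x0 = torch.randn(4, 3, 28, 28)            # pretend images/latents
noise = torch.randn_like(x0)
t = torch.randint(0, 1000, (4,))          # one timestep per sample
xt = scheduler.add_noise(x0, noise, t)    # more noise for larger t
print(xt.shape, scheduler.alpha_cum_prod[t])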
+
266
+ import torch
267
+ import math
268
+
269
+ class CosineNoiseScheduler:
270
+ r"""
271
+ Class for the cosine noise scheduler, often used in DDPM-based models.
272
+ """
273
+
274
+ def __init__(self, num_timesteps, s=0.008):
275
+ self.num_timesteps = num_timesteps
276
+ self.s = s
277
+
278
+ # Cosine schedule based on paper
279
+ def cosine_schedule(t):
280
+ return math.cos((t / self.num_timesteps + s) / (1 + s) * math.pi / 2) ** 2
281
+
282
+ # Compute alphas
283
+ self.alphas = torch.tensor([cosine_schedule(t) for t in range(num_timesteps)])
284
+ self.alpha_cum_prod = torch.cumprod(self.alphas, dim=0)
285
+ self.sqrt_alpha_cum_prod = torch.sqrt(self.alpha_cum_prod)
286
+ self.sqrt_one_minus_alpha_cum_prod = torch.sqrt(1 - self.alpha_cum_prod)
287
+
288
+ def add_noise(self, original, noise, t):
289
+ original_shape = original.shape
290
+ batch_size = original_shape[0]
291
+
292
+ sqrt_alpha_cum_prod = self.sqrt_alpha_cum_prod.to(original.device)[t].reshape(batch_size)
293
+ sqrt_one_minus_alpha_cum_prod = self.sqrt_one_minus_alpha_cum_prod.to(original.device)[t].reshape(batch_size)
294
+
295
+ for _ in range(len(original_shape) - 1):
296
+ sqrt_alpha_cum_prod = sqrt_alpha_cum_prod.unsqueeze(-1)
297
+ for _ in range(len(original_shape) - 1):
298
+ sqrt_one_minus_alpha_cum_prod = sqrt_one_minus_alpha_cum_prod.unsqueeze(-1)
299
+
300
+ return (sqrt_alpha_cum_prod * original + sqrt_one_minus_alpha_cum_prod * noise)
301
+
302
+ def sample_prev_timestep(self, xt, noise_pred, t):
303
+ x0 = ((xt - (self.sqrt_one_minus_alpha_cum_prod.to(xt.device)[t] * noise_pred)) /
304
+ torch.sqrt(self.alpha_cum_prod.to(xt.device)[t]))
305
+ x0 = torch.clamp(x0, -1., 1.)
306
+
307
+ mean = xt - ((1 - self.alphas.to(xt.device)[t]) * noise_pred) / (self.sqrt_one_minus_alpha_cum_prod.to(xt.device)[t])
308
+ mean = mean / torch.sqrt(self.alphas.to(xt.device)[t])
309
+
310
+ if t == 0:
311
+ return mean, x0
312
+ else:
313
+ variance = (1 - self.alpha_cum_prod.to(xt.device)[t - 1]) / (1.0 - self.alpha_cum_prod.to(xt.device)[t])
314
+ variance = variance * (1 - self.alphas.to(xt.device)[t])
315
+ sigma = variance ** 0.5
316
+ z = torch.randn(xt.shape).to(xt.device)
317
+ return mean + sigma * z, x0
318
+
319
+
320
+
321
+
322
+ import torch
323
+ import torch.nn as nn
324
+
325
+
326
+ def get_time_embedding(time_steps, temb_dim):
327
+ r"""
328
+ Convert time steps tensor into an embedding using the
329
+ sinusoidal time embedding formula
330
+ :param time_steps: 1D tensor of length batch size
331
+ :param temb_dim: Dimension of the embedding
332
+ :return: BxD embedding representation of B time steps
333
+ """
334
+ assert temb_dim % 2 == 0, "time embedding dimension must be divisible by 2"
335
+
336
+ # factor = 10000^(2i/d_model)
337
+ factor = 10000 ** ((torch.arange(
338
+ start=0, end=temb_dim // 2, dtype=torch.float32, device=time_steps.device) / (temb_dim // 2))
339
+ )
340
+
341
+ # pos / factor
342
+ # timesteps B -> B, 1 -> B, temb_dim
343
+ t_emb = time_steps[:, None].repeat(1, temb_dim // 2) / factor
344
+ t_emb = torch.cat([torch.sin(t_emb), torch.cos(t_emb)], dim=-1)
345
+ return t_emb
346
+
347
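get_time_embedding turns a batch of integer timesteps into sinusoidal features of width temb_dim (first half sin, second half cos). A quick shape check with assumed values:

time_steps = torch.randint(0, 1000, (8,))          # B = 8 sampled timesteps
t_emb = get_time_embedding(time_steps, temb_dim=128)
print(t_emb.shape)                                 # torch.Size([8, 128])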
+
348
+ class DownBlock(nn.Module):
349
+ r"""
350
+ Down conv block with attention.
351
+     Sequence of the following blocks
352
+ 1. Resnet block with time embedding
353
+ 2. Attention block
354
+ 3. Downsample
355
+ """
356
+
357
+ def __init__(self, in_channels, out_channels, t_emb_dim,
358
+ down_sample, num_heads, num_layers, attn, norm_channels, cross_attn=False, context_dim=None):
359
+ super().__init__()
360
+ self.num_layers = num_layers
361
+ self.down_sample = down_sample
362
+ self.attn = attn
363
+ self.context_dim = context_dim
364
+ self.cross_attn = cross_attn
365
+ self.t_emb_dim = t_emb_dim
366
+ self.resnet_conv_first = nn.ModuleList(
367
+ [
368
+ nn.Sequential(
369
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
370
+ nn.SiLU(),
371
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels,
372
+ kernel_size=3, stride=1, padding=1),
373
+ )
374
+ for i in range(num_layers)
375
+ ]
376
+ )
377
+ if self.t_emb_dim is not None:
378
+ self.t_emb_layers = nn.ModuleList([
379
+ nn.Sequential(
380
+ nn.SiLU(),
381
+ nn.Linear(self.t_emb_dim, out_channels)
382
+ )
383
+ for _ in range(num_layers)
384
+ ])
385
+ self.resnet_conv_second = nn.ModuleList(
386
+ [
387
+ nn.Sequential(
388
+ nn.GroupNorm(norm_channels, out_channels),
389
+ nn.SiLU(),
390
+ nn.Conv2d(out_channels, out_channels,
391
+ kernel_size=3, stride=1, padding=1),
392
+ )
393
+ for _ in range(num_layers)
394
+ ]
395
+ )
396
+
397
+ if self.attn:
398
+ self.attention_norms = nn.ModuleList(
399
+ [nn.GroupNorm(norm_channels, out_channels)
400
+ for _ in range(num_layers)]
401
+ )
402
+
403
+ self.attentions = nn.ModuleList(
404
+ [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
405
+ for _ in range(num_layers)]
406
+ )
407
+
408
+ if self.cross_attn:
409
+ assert context_dim is not None, "Context Dimension must be passed for cross attention"
410
+ self.cross_attention_norms = nn.ModuleList(
411
+ [nn.GroupNorm(norm_channels, out_channels)
412
+ for _ in range(num_layers)]
413
+ )
414
+ self.cross_attentions = nn.ModuleList(
415
+ [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
416
+ for _ in range(num_layers)]
417
+ )
418
+ self.context_proj = nn.ModuleList(
419
+ [nn.Linear(context_dim, out_channels)
420
+ for _ in range(num_layers)]
421
+ )
422
+
423
+ self.residual_input_conv = nn.ModuleList(
424
+ [
425
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
426
+ for i in range(num_layers)
427
+ ]
428
+ )
429
+ self.down_sample_conv = nn.Conv2d(out_channels, out_channels,
430
+ 4, 2, 1) if self.down_sample else nn.Identity()
431
+
432
+ def forward(self, x, t_emb=None, context=None):
433
+ out = x
434
+ for i in range(self.num_layers):
435
+ # Resnet block of Unet
436
+ resnet_input = out
437
+ out = self.resnet_conv_first[i](out)
438
+ if self.t_emb_dim is not None:
439
+ out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
440
+ out = self.resnet_conv_second[i](out)
441
+ out = out + self.residual_input_conv[i](resnet_input)
442
+
443
+ if self.attn:
444
+ # Attention block of Unet
445
+ batch_size, channels, h, w = out.shape
446
+ in_attn = out.reshape(batch_size, channels, h * w)
447
+ in_attn = self.attention_norms[i](in_attn)
448
+ in_attn = in_attn.transpose(1, 2)
449
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
450
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
451
+ out = out + out_attn
452
+
453
+ if self.cross_attn:
454
+ assert context is not None, "context cannot be None if cross attention layers are used"
455
+ batch_size, channels, h, w = out.shape
456
+ in_attn = out.reshape(batch_size, channels, h * w)
457
+ in_attn = self.cross_attention_norms[i](in_attn)
458
+ in_attn = in_attn.transpose(1, 2)
459
+ assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
460
+ context_proj = self.context_proj[i](context)
461
+ out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
462
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
463
+ out = out + out_attn
464
+
465
+ # Downsample
466
+ out = self.down_sample_conv(out)
467
+ return out
468
+
469
+
470
+ class MidBlock(nn.Module):
471
+ r"""
472
+ Mid conv block with attention.
473
+ Sequence of following blocks
474
+ 1. Resnet block with time embedding
475
+ 2. Attention block
476
+ 3. Resnet block with time embedding
477
+ """
478
+
479
+ def __init__(self, in_channels, out_channels, t_emb_dim, num_heads, num_layers, norm_channels, cross_attn=None, context_dim=None):
480
+ super().__init__()
481
+ self.num_layers = num_layers
482
+ self.t_emb_dim = t_emb_dim
483
+ self.context_dim = context_dim
484
+ self.cross_attn = cross_attn
485
+ self.resnet_conv_first = nn.ModuleList(
486
+ [
487
+ nn.Sequential(
488
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
489
+ nn.SiLU(),
490
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
491
+ padding=1),
492
+ )
493
+ for i in range(num_layers + 1)
494
+ ]
495
+ )
496
+
497
+ if self.t_emb_dim is not None:
498
+ self.t_emb_layers = nn.ModuleList([
499
+ nn.Sequential(
500
+ nn.SiLU(),
501
+ nn.Linear(t_emb_dim, out_channels)
502
+ )
503
+ for _ in range(num_layers + 1)
504
+ ])
505
+ self.resnet_conv_second = nn.ModuleList(
506
+ [
507
+ nn.Sequential(
508
+ nn.GroupNorm(norm_channels, out_channels),
509
+ nn.SiLU(),
510
+ nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
511
+ )
512
+ for _ in range(num_layers + 1)
513
+ ]
514
+ )
515
+
516
+ self.attention_norms = nn.ModuleList(
517
+ [nn.GroupNorm(norm_channels, out_channels)
518
+ for _ in range(num_layers)]
519
+ )
520
+
521
+ self.attentions = nn.ModuleList(
522
+ [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
523
+ for _ in range(num_layers)]
524
+ )
525
+ if self.cross_attn:
526
+ assert context_dim is not None, "Context Dimension must be passed for cross attention"
527
+ self.cross_attention_norms = nn.ModuleList(
528
+ [nn.GroupNorm(norm_channels, out_channels)
529
+ for _ in range(num_layers)]
530
+ )
531
+ self.cross_attentions = nn.ModuleList(
532
+ [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
533
+ for _ in range(num_layers)]
534
+ )
535
+ self.context_proj = nn.ModuleList(
536
+ [nn.Linear(context_dim, out_channels)
537
+ for _ in range(num_layers)]
538
+ )
539
+ self.residual_input_conv = nn.ModuleList(
540
+ [
541
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
542
+ for i in range(num_layers + 1)
543
+ ]
544
+ )
545
+
546
+ def forward(self, x, t_emb=None, context=None):
547
+ out = x
548
+
549
+ # First resnet block
550
+ resnet_input = out
551
+ out = self.resnet_conv_first[0](out)
552
+ if self.t_emb_dim is not None:
553
+ out = out + self.t_emb_layers[0](t_emb)[:, :, None, None]
554
+ out = self.resnet_conv_second[0](out)
555
+ out = out + self.residual_input_conv[0](resnet_input)
556
+
557
+ for i in range(self.num_layers):
558
+ # Attention Block
559
+ batch_size, channels, h, w = out.shape
560
+ in_attn = out.reshape(batch_size, channels, h * w)
561
+ in_attn = self.attention_norms[i](in_attn)
562
+ in_attn = in_attn.transpose(1, 2)
563
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
564
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
565
+ out = out + out_attn
566
+
567
+ if self.cross_attn:
568
+ assert context is not None, "context cannot be None if cross attention layers are used"
569
+ batch_size, channels, h, w = out.shape
570
+ in_attn = out.reshape(batch_size, channels, h * w)
571
+ in_attn = self.cross_attention_norms[i](in_attn)
572
+ in_attn = in_attn.transpose(1, 2)
573
+ assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
574
+ context_proj = self.context_proj[i](context)
575
+ out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
576
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
577
+ out = out + out_attn
578
+
579
+
580
+ # Resnet Block
581
+ resnet_input = out
582
+ out = self.resnet_conv_first[i + 1](out)
583
+ if self.t_emb_dim is not None:
584
+ out = out + self.t_emb_layers[i + 1](t_emb)[:, :, None, None]
585
+ out = self.resnet_conv_second[i + 1](out)
586
+ out = out + self.residual_input_conv[i + 1](resnet_input)
587
+
588
+ return out
589
+
590
+
591
+ class UpBlock(nn.Module):
592
+ r"""
593
+ Up conv block with attention.
594
+ Sequence of following blocks
595
+ 1. Upsample
596
+ 1. Concatenate Down block output
597
+ 2. Resnet block with time embedding
598
+ 3. Attention Block
599
+ """
600
+
601
+ def __init__(self, in_channels, out_channels, t_emb_dim,
602
+ up_sample, num_heads, num_layers, attn, norm_channels):
603
+ super().__init__()
604
+ self.num_layers = num_layers
605
+ self.up_sample = up_sample
606
+ self.t_emb_dim = t_emb_dim
607
+ self.attn = attn
608
+ self.resnet_conv_first = nn.ModuleList(
609
+ [
610
+ nn.Sequential(
611
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
612
+ nn.SiLU(),
613
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
614
+ padding=1),
615
+ )
616
+ for i in range(num_layers)
617
+ ]
618
+ )
619
+
620
+ if self.t_emb_dim is not None:
621
+ self.t_emb_layers = nn.ModuleList([
622
+ nn.Sequential(
623
+ nn.SiLU(),
624
+ nn.Linear(t_emb_dim, out_channels)
625
+ )
626
+ for _ in range(num_layers)
627
+ ])
628
+
629
+ self.resnet_conv_second = nn.ModuleList(
630
+ [
631
+ nn.Sequential(
632
+ nn.GroupNorm(norm_channels, out_channels),
633
+ nn.SiLU(),
634
+ nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
635
+ )
636
+ for _ in range(num_layers)
637
+ ]
638
+ )
639
+ if self.attn:
640
+ self.attention_norms = nn.ModuleList(
641
+ [
642
+ nn.GroupNorm(norm_channels, out_channels)
643
+ for _ in range(num_layers)
644
+ ]
645
+ )
646
+
647
+ self.attentions = nn.ModuleList(
648
+ [
649
+ nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
650
+ for _ in range(num_layers)
651
+ ]
652
+ )
653
+
654
+ self.residual_input_conv = nn.ModuleList(
655
+ [
656
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
657
+ for i in range(num_layers)
658
+ ]
659
+ )
660
+ self.up_sample_conv = nn.ConvTranspose2d(in_channels, in_channels,
661
+ 4, 2, 1) \
662
+ if self.up_sample else nn.Identity()
663
+
664
+ def forward(self, x, out_down=None, t_emb=None):
665
+ # Upsample
666
+ x = self.up_sample_conv(x)
667
+
668
+ # Concat with Downblock output
669
+ if out_down is not None:
670
+ x = torch.cat([x, out_down], dim=1)
671
+
672
+ out = x
673
+ for i in range(self.num_layers):
674
+ # Resnet Block
675
+ resnet_input = out
676
+ out = self.resnet_conv_first[i](out)
677
+ if self.t_emb_dim is not None:
678
+ out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
679
+ out = self.resnet_conv_second[i](out)
680
+ out = out + self.residual_input_conv[i](resnet_input)
681
+
682
+ # Self Attention
683
+ if self.attn:
684
+ batch_size, channels, h, w = out.shape
685
+ in_attn = out.reshape(batch_size, channels, h * w)
686
+ in_attn = self.attention_norms[i](in_attn)
687
+ in_attn = in_attn.transpose(1, 2)
688
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
689
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
690
+ out = out + out_attn
691
+ return out
692
+
693
+
694
+ class UpBlockUnet(nn.Module):
695
+ r"""
696
+ Up conv block with attention.
697
+ Sequence of following blocks
698
+ 1. Upsample
699
+     2. Concatenate Down block output
700
+     3. Resnet block with time embedding
701
+     4. Attention Block
702
+ """
703
+
704
+ def __init__(self, in_channels, out_channels, t_emb_dim, up_sample,
705
+ num_heads, num_layers, norm_channels, cross_attn=False, context_dim=None):
706
+ super().__init__()
707
+ self.num_layers = num_layers
708
+ self.up_sample = up_sample
709
+ self.t_emb_dim = t_emb_dim
710
+ self.cross_attn = cross_attn
711
+ self.context_dim = context_dim
712
+ self.resnet_conv_first = nn.ModuleList(
713
+ [
714
+ nn.Sequential(
715
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
716
+ nn.SiLU(),
717
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
718
+ padding=1),
719
+ )
720
+ for i in range(num_layers)
721
+ ]
722
+ )
723
+
724
+ if self.t_emb_dim is not None:
725
+ self.t_emb_layers = nn.ModuleList([
726
+ nn.Sequential(
727
+ nn.SiLU(),
728
+ nn.Linear(t_emb_dim, out_channels)
729
+ )
730
+ for _ in range(num_layers)
731
+ ])
732
+
733
+ self.resnet_conv_second = nn.ModuleList(
734
+ [
735
+ nn.Sequential(
736
+ nn.GroupNorm(norm_channels, out_channels),
737
+ nn.SiLU(),
738
+ nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
739
+ )
740
+ for _ in range(num_layers)
741
+ ]
742
+ )
743
+
744
+ self.attention_norms = nn.ModuleList(
745
+ [
746
+ nn.GroupNorm(norm_channels, out_channels)
747
+ for _ in range(num_layers)
748
+ ]
749
+ )
750
+
751
+ self.attentions = nn.ModuleList(
752
+ [
753
+ nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
754
+ for _ in range(num_layers)
755
+ ]
756
+ )
757
+
758
+ if self.cross_attn:
759
+ assert context_dim is not None, "Context Dimension must be passed for cross attention"
760
+ self.cross_attention_norms = nn.ModuleList(
761
+ [nn.GroupNorm(norm_channels, out_channels)
762
+ for _ in range(num_layers)]
763
+ )
764
+ self.cross_attentions = nn.ModuleList(
765
+ [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
766
+ for _ in range(num_layers)]
767
+ )
768
+ self.context_proj = nn.ModuleList(
769
+ [nn.Linear(context_dim, out_channels)
770
+ for _ in range(num_layers)]
771
+ )
772
+ self.residual_input_conv = nn.ModuleList(
773
+ [
774
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
775
+ for i in range(num_layers)
776
+ ]
777
+ )
778
+ self.up_sample_conv = nn.ConvTranspose2d(in_channels // 2, in_channels // 2,
779
+ 4, 2, 1) \
780
+ if self.up_sample else nn.Identity()
781
+
782
+ def forward(self, x, out_down=None, t_emb=None, context=None):
783
+ x = self.up_sample_conv(x)
784
+ if out_down is not None:
785
+ x = torch.cat([x, out_down], dim=1)
786
+
787
+ out = x
788
+ for i in range(self.num_layers):
789
+ # Resnet
790
+ resnet_input = out
791
+ out = self.resnet_conv_first[i](out)
792
+ if self.t_emb_dim is not None:
793
+ out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
794
+ out = self.resnet_conv_second[i](out)
795
+ out = out + self.residual_input_conv[i](resnet_input)
796
+ # Self Attention
797
+ batch_size, channels, h, w = out.shape
798
+ in_attn = out.reshape(batch_size, channels, h * w)
799
+ in_attn = self.attention_norms[i](in_attn)
800
+ in_attn = in_attn.transpose(1, 2)
801
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
802
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
803
+ out = out + out_attn
804
+ # Cross Attention
805
+ if self.cross_attn:
806
+ assert context is not None, "context cannot be None if cross attention layers are used"
807
+ batch_size, channels, h, w = out.shape
808
+ in_attn = out.reshape(batch_size, channels, h * w)
809
+ in_attn = self.cross_attention_norms[i](in_attn)
810
+ in_attn = in_attn.transpose(1, 2)
811
+ assert len(context.shape) == 3, \
812
+ "Context shape does not match B,_,CONTEXT_DIM"
813
+ assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim,\
814
+ "Context shape does not match B,_,CONTEXT_DIM"
815
+ context_proj = self.context_proj[i](context)
816
+ out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
817
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
818
+ out = out + out_attn
819
+
820
+ return out
821
+
822
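+ # --- Illustrative sketch (commented out, not part of the training flow) ---
+ # Minimal shape check for UpBlockUnet. The channel sizes, spatial size and
+ # embedding dim below are arbitrary assumptions chosen only for illustration.
+ # up = UpBlockUnet(in_channels=64, out_channels=16, t_emb_dim=128,
+ #                  up_sample=True, num_heads=4, num_layers=1, norm_channels=8)
+ # x = torch.randn(2, 32, 8, 8)         # up_sample_conv expects in_channels // 2
+ # skip = torch.randn(2, 32, 16, 16)    # matching down-block output for the concat
+ # t_emb = torch.randn(2, 128)
+ # out = up(x, out_down=skip, t_emb=t_emb)
+ # print(out.shape)                     # expected: torch.Size([2, 16, 16, 16])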
+ """Vqvae"""
823
+
824
+ import torch
825
+ import torch.nn as nn
826
+
827
+
828
+ class VQVAE(nn.Module):
829
+ def __init__(self, im_channels, model_config):
830
+ super().__init__()
831
+ self.down_channels = model_config.down_channels
832
+ self.mid_channels = model_config.mid_channels
833
+ self.down_sample = model_config.down_sample
834
+ self.num_down_layers = model_config.num_down_layers
835
+ self.num_mid_layers = model_config.num_mid_layers
836
+ self.num_up_layers = model_config.num_up_layers
837
+
838
+ # To disable attention in Downblock of Encoder and Upblock of Decoder
839
+ self.attns = model_config.attn_down
840
+
841
+ # Latent Dimension
842
+ self.z_channels = model_config.z_channels
843
+ self.codebook_size = model_config.codebook_size
844
+ self.norm_channels = model_config.norm_channels
845
+ self.num_heads = model_config.num_heads
846
+
847
+ # Assertion to validate the channel information
848
+ assert self.mid_channels[0] == self.down_channels[-1]
849
+ assert self.mid_channels[-1] == self.down_channels[-1]
850
+ assert len(self.down_sample) == len(self.down_channels) - 1
851
+ assert len(self.attns) == len(self.down_channels) - 1
852
+
853
+ # Wherever we use downsampling in encoder correspondingly use
854
+ # upsampling in decoder
855
+ self.up_sample = list(reversed(self.down_sample))
856
+
857
+ ##################### Encoder ######################
858
+ self.encoder_conv_in = nn.Conv2d(im_channels, self.down_channels[0], kernel_size=3, padding=(1, 1))
859
+
860
+ # Downblock + Midblock
861
+ self.encoder_layers = nn.ModuleList([])
862
+ for i in range(len(self.down_channels) - 1):
863
+ self.encoder_layers.append(DownBlock(self.down_channels[i], self.down_channels[i + 1],
864
+ t_emb_dim=None, down_sample=self.down_sample[i],
865
+ num_heads=self.num_heads,
866
+ num_layers=self.num_down_layers,
867
+ attn=self.attns[i],
868
+ norm_channels=self.norm_channels))
869
+
870
+ self.encoder_mids = nn.ModuleList([])
871
+ for i in range(len(self.mid_channels) - 1):
872
+ self.encoder_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i + 1],
873
+ t_emb_dim=None,
874
+ num_heads=self.num_heads,
875
+ num_layers=self.num_mid_layers,
876
+ norm_channels=self.norm_channels))
877
+
878
+ self.encoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[-1])
879
+ self.encoder_conv_out = nn.Conv2d(self.down_channels[-1], self.z_channels, kernel_size=3, padding=1)
880
+
881
+ # Pre Quantization Convolution
882
+ self.pre_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)
883
+
884
+ # Codebook
885
+ self.embedding = nn.Embedding(self.codebook_size, self.z_channels)
886
+ ####################################################
887
+
888
+ ##################### Decoder ######################
889
+
890
+ # Post Quantization Convolution
891
+ self.post_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)
892
+ self.decoder_conv_in = nn.Conv2d(self.z_channels, self.mid_channels[-1], kernel_size=3, padding=(1, 1))
893
+
894
+ # Midblock + Upblock
895
+ self.decoder_mids = nn.ModuleList([])
896
+ for i in reversed(range(1, len(self.mid_channels))):
897
+ self.decoder_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i - 1],
898
+ t_emb_dim=None,
899
+ num_heads=self.num_heads,
900
+ num_layers=self.num_mid_layers,
901
+ norm_channels=self.norm_channels))
902
+
903
+ self.decoder_layers = nn.ModuleList([])
904
+ for i in reversed(range(1, len(self.down_channels))):
905
+ self.decoder_layers.append(UpBlock(self.down_channels[i], self.down_channels[i - 1],
906
+ t_emb_dim=None, up_sample=self.down_sample[i - 1],
907
+ num_heads=self.num_heads,
908
+ num_layers=self.num_up_layers,
909
+ attn=self.attns[i-1],
910
+ norm_channels=self.norm_channels))
911
+
912
+ self.decoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[0])
913
+ self.decoder_conv_out = nn.Conv2d(self.down_channels[0], im_channels, kernel_size=3, padding=1)
914
+
915
+ def quantize(self, x):
916
+ B, C, H, W = x.shape
917
+
918
+ # B, C, H, W -> B, H, W, C
919
+ x = x.permute(0, 2, 3, 1)
920
+
921
+ # B, H, W, C -> B, H*W, C
922
+ x = x.reshape(x.size(0), -1, x.size(-1))
923
+
924
+ # Find nearest embedding/codebook vector
925
+ # dist between (B, H*W, C) and (B, K, C) -> (B, H*W, K)
926
+ dist = torch.cdist(x, self.embedding.weight[None, :].repeat((x.size(0), 1, 1)))
927
+ # (B, H*W)
928
+ min_encoding_indices = torch.argmin(dist, dim=-1)
929
+
930
+ # Replace encoder output with nearest codebook
931
+ # quant_out -> B*H*W, C
932
+ quant_out = torch.index_select(self.embedding.weight, 0, min_encoding_indices.view(-1))
933
+
934
+ # x -> B*H*W, C
935
+ x = x.reshape((-1, x.size(-1)))
936
+ commitment_loss = torch.mean((quant_out.detach() - x) ** 2)
937
+ codebook_loss = torch.mean((quant_out - x.detach()) ** 2)
938
+ quantize_losses = {
939
+ 'codebook_loss': codebook_loss,
940
+ 'commitment_loss': commitment_loss
941
+ }
942
+ # Straight through estimation
943
+ quant_out = x + (quant_out - x).detach()
944
+
945
+ # quant_out -> B, C, H, W
946
+ quant_out = quant_out.reshape((B, H, W, C)).permute(0, 3, 1, 2)
947
+ min_encoding_indices = min_encoding_indices.reshape((-1, quant_out.size(-2), quant_out.size(-1)))
948
+ return quant_out, quantize_losses, min_encoding_indices
949
+
950
+ def encode(self, x):
951
+ out = self.encoder_conv_in(x)
952
+ for idx, down in enumerate(self.encoder_layers):
953
+ out = down(out)
954
+ for mid in self.encoder_mids:
955
+ out = mid(out)
956
+ out = self.encoder_norm_out(out)
957
+ out = nn.SiLU()(out)
958
+ out = self.encoder_conv_out(out)
959
+ out = self.pre_quant_conv(out)
960
+ out, quant_losses, _ = self.quantize(out)
961
+ return out, quant_losses
962
+
963
+ def decode(self, z):
964
+ out = z
965
+ out = self.post_quant_conv(out)
966
+ out = self.decoder_conv_in(out)
967
+ for mid in self.decoder_mids:
968
+ out = mid(out)
969
+ for idx, up in enumerate(self.decoder_layers):
970
+ out = up(out)
971
+
972
+ out = self.decoder_norm_out(out)
973
+ out = nn.SiLU()(out)
974
+ out = self.decoder_conv_out(out)
975
+ return out
976
+
977
+ def forward(self, x):
978
+ z, quant_losses = self.encode(x)
979
+ out = self.decode(z)
980
+ return out, z, quant_losses
981
+
982
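+ # --- Illustrative sketch (commented out) ----------------------------------
+ # The core of VQVAE.quantize() in isolation: nearest-codebook lookup plus the
+ # straight-through estimator. The shapes and codebook size are arbitrary
+ # assumptions for illustration only.
+ # B, C, H, W, K = 2, 3, 4, 4, 8
+ # codebook = nn.Embedding(K, C)
+ # z_e = torch.randn(B, C, H, W)
+ # flat = z_e.permute(0, 2, 3, 1).reshape(B, H * W, C)
+ # dist = torch.cdist(flat, codebook.weight[None].repeat(B, 1, 1))   # (B, H*W, K)
+ # idx = torch.argmin(dist, dim=-1)                                  # (B, H*W)
+ # z_q = codebook(idx)                                               # (B, H*W, C)
+ # z_q = flat + (z_q - flat).detach()   # straight-through: grads flow to z_e
+ # z_q = z_q.reshape(B, H, W, C).permute(0, 3, 1, 2)                 # (B, C, H, W)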
+ """Vae"""
983
+
984
+ import torch
985
+ import torch.nn as nn
986
+
987
+
988
+ class VAE(nn.Module):
989
+ def __init__(self, im_channels, model_config):
990
+ super().__init__()
991
+ self.down_channels = model_config['down_channels']
992
+ self.mid_channels = model_config['mid_channels']
993
+ self.down_sample = model_config['down_sample']
994
+ self.num_down_layers = model_config['num_down_layers']
995
+ self.num_mid_layers = model_config['num_mid_layers']
996
+ self.num_up_layers = model_config['num_up_layers']
997
+
998
+ # To disable attention in Downblock of Encoder and Upblock of Decoder
999
+ self.attns = model_config['attn_down']
1000
+
1001
+ # Latent Dimension
1002
+ self.z_channels = model_config['z_channels']
1003
+ self.norm_channels = model_config['norm_channels']
1004
+ self.num_heads = model_config['num_heads']
1005
+
1006
+ # Assertion to validate the channel information
1007
+ assert self.mid_channels[0] == self.down_channels[-1]
1008
+ assert self.mid_channels[-1] == self.down_channels[-1]
1009
+ assert len(self.down_sample) == len(self.down_channels) - 1
1010
+ assert len(self.attns) == len(self.down_channels) - 1
1011
+
1012
+ # Wherever we use downsampling in encoder correspondingly use
1013
+ # upsampling in decoder
1014
+ self.up_sample = list(reversed(self.down_sample))
1015
+
1016
+ ##################### Encoder ######################
1017
+ self.encoder_conv_in = nn.Conv2d(im_channels, self.down_channels[0], kernel_size=3, padding=(1, 1))
1018
+
1019
+ # Downblock + Midblock
1020
+ self.encoder_layers = nn.ModuleList([])
1021
+ for i in range(len(self.down_channels) - 1):
1022
+ self.encoder_layers.append(DownBlock(self.down_channels[i], self.down_channels[i + 1],
1023
+ t_emb_dim=None, down_sample=self.down_sample[i],
1024
+ num_heads=self.num_heads,
1025
+ num_layers=self.num_down_layers,
1026
+ attn=self.attns[i],
1027
+ norm_channels=self.norm_channels))
1028
+
1029
+ self.encoder_mids = nn.ModuleList([])
1030
+ for i in range(len(self.mid_channels) - 1):
1031
+ self.encoder_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i + 1],
1032
+ t_emb_dim=None,
1033
+ num_heads=self.num_heads,
1034
+ num_layers=self.num_mid_layers,
1035
+ norm_channels=self.norm_channels))
1036
+
1037
+ self.encoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[-1])
1038
+ self.encoder_conv_out = nn.Conv2d(self.down_channels[-1], 2*self.z_channels, kernel_size=3, padding=1)
1039
+
1040
+ # Latent Dimension is 2*Latent because we are predicting mean & variance
1041
+ self.pre_quant_conv = nn.Conv2d(2*self.z_channels, 2*self.z_channels, kernel_size=1)
1042
+ ####################################################
1043
+
1044
+
1045
+ ##################### Decoder ######################
1046
+ self.post_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)
1047
+ self.decoder_conv_in = nn.Conv2d(self.z_channels, self.mid_channels[-1], kernel_size=3, padding=(1, 1))
1048
+
1049
+ # Midblock + Upblock
1050
+ self.decoder_mids = nn.ModuleList([])
1051
+ for i in reversed(range(1, len(self.mid_channels))):
1052
+ self.decoder_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i - 1],
1053
+ t_emb_dim=None,
1054
+ num_heads=self.num_heads,
1055
+ num_layers=self.num_mid_layers,
1056
+ norm_channels=self.norm_channels))
1057
+
1058
+ self.decoder_layers = nn.ModuleList([])
1059
+ for i in reversed(range(1, len(self.down_channels))):
1060
+ self.decoder_layers.append(UpBlock(self.down_channels[i], self.down_channels[i - 1],
1061
+ t_emb_dim=None, up_sample=self.down_sample[i - 1],
1062
+ num_heads=self.num_heads,
1063
+ num_layers=self.num_up_layers,
1064
+ attn=self.attns[i - 1],
1065
+ norm_channels=self.norm_channels))
1066
+
1067
+ self.decoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[0])
1068
+ self.decoder_conv_out = nn.Conv2d(self.down_channels[0], im_channels, kernel_size=3, padding=1)
1069
+
1070
+ def encode(self, x):
1071
+ out = self.encoder_conv_in(x)
1072
+ for idx, down in enumerate(self.encoder_layers):
1073
+ out = down(out)
1074
+ for mid in self.encoder_mids:
1075
+ out = mid(out)
1076
+ out = self.encoder_norm_out(out)
1077
+ out = nn.SiLU()(out)
1078
+ out = self.encoder_conv_out(out)
1079
+ out = self.pre_quant_conv(out)
1080
+ mean, logvar = torch.chunk(out, 2, dim=1)
1081
+ std = torch.exp(0.5 * logvar)
1082
+ sample = mean + std * torch.randn(mean.shape).to(device=x.device)
1083
+ return sample, out
1084
+
1085
+ def decode(self, z):
1086
+ out = z
1087
+ out = self.post_quant_conv(out)
1088
+ out = self.decoder_conv_in(out)
1089
+ for mid in self.decoder_mids:
1090
+ out = mid(out)
1091
+ for idx, up in enumerate(self.decoder_layers):
1092
+ out = up(out)
1093
+
1094
+ out = self.decoder_norm_out(out)
1095
+ out = nn.SiLU()(out)
1096
+ out = self.decoder_conv_out(out)
1097
+ return out
1098
+
1099
+ def forward(self, x):
1100
+ z, encoder_output = self.encode(x)
1101
+ out = self.decode(z)
1102
+ return out, encoder_output
1103
+
1104
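+ # --- Illustrative sketch (commented out) ----------------------------------
+ # How a KL regularisation term could be computed from the encoder output
+ # returned by VAE.forward() (mean and logvar are concatenated along the
+ # channel dim). The kl_weight value is an arbitrary assumption; the training
+ # loop for this VAE is not part of this script.
+ # def kl_loss_from_encoder_output(encoder_output, kl_weight=1e-6):
+ #     mean, logvar = torch.chunk(encoder_output, 2, dim=1)
+ #     kl = 0.5 * torch.sum(mean ** 2 + logvar.exp() - 1.0 - logvar, dim=[1, 2, 3])
+ #     return kl_weight * kl.mean()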
+ """Discriminator"""
1105
+
1106
+ import torch
1107
+ import torch.nn as nn
1108
+
1109
+
1110
+ class Discriminator(nn.Module):
1111
+ r"""
1112
+ PatchGAN Discriminator.
1113
+ Rather than mapping an IMG_CHANNELS x IMG_H x IMG_W image all the way
1114
+ down to a single scalar value, it predicts a grid of values,
1115
+ where each grid cell is the discriminator's estimate of how likely
1116
+ the image patch corresponding to that cell is real.
1117
+
1118
+ """
1119
+
1120
+ def __init__(self, im_channels=3,
1121
+ conv_channels=[64, 128, 256],
1122
+ kernels=[4,4,4,4],
1123
+ strides=[2,2,2,1],
1124
+ paddings=[1,1,1,1]):
1125
+ super().__init__()
1126
+ self.im_channels = im_channels
1127
+ activation = nn.LeakyReLU(0.2)
1128
+ layers_dim = [self.im_channels] + conv_channels + [1]
1129
+ self.layers = nn.ModuleList([
1130
+ nn.Sequential(
1131
+ nn.Conv2d(layers_dim[i], layers_dim[i + 1],
1132
+ kernel_size=kernels[i],
1133
+ stride=strides[i],
1134
+ padding=paddings[i],
1135
+ bias=False if i !=0 else True),
1136
+ nn.BatchNorm2d(layers_dim[i + 1]) if i != len(layers_dim) - 2 and i != 0 else nn.Identity(),
1137
+ activation if i != len(layers_dim) - 2 else nn.Identity()
1138
+ )
1139
+ for i in range(len(layers_dim) - 1)
1140
+ ])
1141
+
1142
+ def forward(self, x):
1143
+ out = x
1144
+ for layer in self.layers:
1145
+ out = layer(out)
1146
+ return out
1147
+
1148
+
1149
+ # if __name__ == '__main__':
1150
+ # x = torch.randn((2,3, 256, 256))
1151
+ # prob = Discriminator(im_channels=3)(x)
1152
+ # print(prob.shape)
1153
+
1154
+ # import os
1155
+
1156
+ # image_paths = [os.path.join("/home/taruntejaneurips23/Ashish/datasets/animefacedata/images", f)
1157
+ # for f in os.listdir("/home/taruntejaneurips23/Ashish/datasets/animefacedata/images")]
1158
+ # image_paths
1159
+
1160
+ import glob
1161
+ import os
1162
+ import torchvision
1163
+ from PIL import Image
1164
+ from tqdm import tqdm, trange
1165
+ # from utils.diffusion_utils import load_latents
1166
+ from torch.utils.data.dataset import Dataset
1167
+
1168
+ import pickle
1169
+ import glob
1170
+ import os
1171
+ import torch
1172
+
1173
+
1174
+ def load_latents(latent_path):
1175
+ r"""
1176
+ Simple utility to load pre-computed latents to speed up LDM training
1177
+ :param latent_path:
1178
+ :return:
1179
+ """
1180
+ latent_maps = {}
1181
+ for fname in glob.glob(os.path.join(latent_path, '*.pkl')):
1182
+ s = pickle.load(open(fname, 'rb'))
1183
+ for k, v in s.items():
1184
+ latent_maps[k] = v[0]
1185
+ return latent_maps
1186
+
1187
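+ # --- Illustrative sketch (commented out) ----------------------------------
+ # The on-disk format load_latents() expects: each .pkl holds a dict mapping
+ # an image path to a latent tensor with a leading batch dim of 1 (the loader
+ # keeps v[0]). The latent-saving script is not part of this file, so this is
+ # only an assumed, minimal writer matching that format.
+ # def save_latent_example(latent_dir, image_path, latent):
+ #     os.makedirs(latent_dir, exist_ok=True)
+ #     fname = os.path.join(latent_dir, os.path.basename(image_path) + '.pkl')
+ #     with open(fname, 'wb') as f:
+ #         pickle.dump({image_path: latent.unsqueeze(0).cpu()}, f)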
+
1188
+ def drop_text_condition(text_embed, im, empty_text_embed, text_drop_prob):
1189
+ if text_drop_prob > 0:
1190
+ text_drop_mask = torch.zeros((im.shape[0]), device=im.device).float().uniform_(0,
1191
+ 1) < text_drop_prob
1192
+ assert empty_text_embed is not None, ("Text Conditioning required as well as"
1193
+ " text dropping but empty text representation not created")
1194
+ text_embed[text_drop_mask, :, :] = empty_text_embed[0]
1195
+ return text_embed
1196
+
1197
+
1198
+ def drop_image_condition(image_condition, im, im_drop_prob):
1199
+ if im_drop_prob > 0:
1200
+ im_drop_mask = torch.zeros((im.shape[0], 1, 1, 1), device=im.device).float().uniform_(0,
1201
+ 1) > im_drop_prob
1202
+ return image_condition * im_drop_mask
1203
+ else:
1204
+ return image_condition
1205
+
1206
+
1207
+ def drop_class_condition(class_condition, class_drop_prob, im):
1208
+ if class_drop_prob > 0:
1209
+ class_drop_mask = torch.zeros((im.shape[0], 1), device=im.device).float().uniform_(0,
1210
+ 1) > class_drop_prob
1211
+ return class_condition * class_drop_mask
1212
+ else:
1213
+ return class_condition
1214
+
1215
+
1216
+ class MnistDataset(Dataset):
1217
+ r"""
1218
+ Nothing special here. Just a simple dataset class for mnist images.
1219
+ Created a dataset class rather than using torchvision to allow
1220
+ easy replacement with any other image dataset.
1221
+ """
1222
+
1223
+ def __init__(self, split, im_path, im_size, im_channels,
1224
+ use_latents=False, latent_path=None, condition_config=None):
1225
+ r"""
1226
+ Init method for initializing the dataset properties
1227
+ :param split: train/test to locate the image files
1228
+ :param im_path: root folder of images
1229
+ :param im_ext: image extension. assumes all
1230
+ images would be this type.
1231
+ """
1232
+ self.split = split
1233
+ self.im_size = im_size
1234
+ self.im_channels = im_channels
1235
+
1236
+ # Should we use latents or not
1237
+ self.latent_maps = None
1238
+ self.use_latents = False
1239
+
1240
+ # Conditioning for the dataset
1241
+ self.condition_types = [] if condition_config is None else condition_config['condition_types']
1242
+
1243
+ self.images, self.labels = self.load_images(im_path)
1244
+
1245
+ # Whether to load images and call vae or to load latents
1246
+ if use_latents and latent_path is not None:
1247
+ latent_maps = load_latents(latent_path)
1248
+ if len(latent_maps) == len(self.images):
1249
+ self.use_latents = True
1250
+ self.latent_maps = latent_maps
1251
+ print('Found {} latents'.format(len(self.latent_maps)))
1252
+ else:
1253
+ print('Latents not found')
1254
+
1255
+ def load_images(self, im_path):
1256
+ r"""
1257
+ Gets all images from the path specified
1258
+ and stacks them all up
1259
+ :param im_path:
1260
+ :return:
1261
+ """
1262
+ assert os.path.exists(im_path), "images path {} does not exist".format(im_path)
1263
+ ims = []
1264
+ labels = []
1265
+ for d_name in tqdm(os.listdir(im_path)):
1266
+ fnames = glob.glob(os.path.join(im_path, d_name, '*.{}'.format('png')))
1267
+ fnames += glob.glob(os.path.join(im_path, d_name, '*.{}'.format('jpg')))
1268
+ fnames += glob.glob(os.path.join(im_path, d_name, '*.{}'.format('jpeg')))
1269
+ for fname in fnames:
1270
+ ims.append(fname)
1271
+ if 'class' in self.condition_types:
1272
+ labels.append(int(d_name))
1273
+ print('Found {} images for split {}'.format(len(ims), self.split))
1274
+ return ims, labels
1275
+
1276
+ def __len__(self):
1277
+ return len(self.images)
1278
+
1279
+ def __getitem__(self, index):
1280
+ ######## Set Conditioning Info ########
1281
+ cond_inputs = {}
1282
+ if 'class' in self.condition_types:
1283
+ cond_inputs['class'] = self.labels[index]
1284
+ #######################################
1285
+
1286
+ if self.use_latents:
1287
+ latent = self.latent_maps[self.images[index]]
1288
+ if len(self.condition_types) == 0:
1289
+ return latent
1290
+ else:
1291
+ return latent, cond_inputs
1292
+ else:
1293
+ im = Image.open(self.images[index])
1294
+ im_tensor = torchvision.transforms.ToTensor()(im)
1295
+
1296
+ # Convert input to -1 to 1 range.
1297
+ im_tensor = (2 * im_tensor) - 1
1298
+ if len(self.condition_types) == 0:
1299
+ return im_tensor
1300
+ else:
1301
+ return im_tensor, cond_inputs
1302
+
1303
+
1304
+ class AnimeFaceDataset(Dataset):
1305
+ def __init__(self, split, im_path, im_size, im_channels,
1306
+ use_latents=False, latent_path=None, condition_config=None):
1307
+
1308
+ self.split = split
1309
+ self.im_size = im_size
1310
+ self.im_channels = im_channels
1311
+
1312
+ # Should we use latents or not
1313
+ self.latent_maps = None
1314
+ self.use_latents = False
1315
+
1316
+ # Conditioning for the dataset
1317
+ self.condition_types = [] if condition_config is None else condition_config['condition_types']
1318
+
1319
+ self.images = self.load_images(im_path)
1320
+
1321
+ # Whether to load images and call vae or to load latents
1322
+ if use_latents and latent_path is not None:
1323
+ latent_maps = load_latents(latent_path)
1324
+ if len(latent_maps) == len(self.images):
1325
+ self.use_latents = True
1326
+ self.latent_maps = latent_maps
1327
+ print('Found {} latents'.format(len(self.latent_maps)))
1328
+ else:
1329
+ print('Latents not found')
1330
+
1331
+ def load_images(self, im_path):
1332
+ r"""
1333
+ Gets all images from the path specified
1334
+ and returns their file paths
1335
+ :param im_path:
1336
+ :return:
1337
+ """
1338
+ assert os.path.exists(im_path), "images path {} does not exist".format(im_path)
1339
+ # ims = []
1340
+ # labels = []
1341
+ ims = [os.path.join(im_path, f) for f in os.listdir(im_path)]
1342
+ return ims
1343
+
1344
+ def __len__(self):
1345
+ return len(self.images)
1346
+
1347
+ def __getitem__(self, index):
1348
+ ######## Set Conditioning Info ########
1349
+ # cond_inputs = {}
1350
+ # if 'class' in self.condition_types:
1351
+ # cond_inputs['class'] = self.labels[index]
1352
+ #######################################
1353
+
1354
+ if self.use_latents:
1355
+ latent = self.latent_maps[self.images[index]]
1356
+ if len(self.condition_types) == 0:
1357
+ return latent
1358
+ # else:
1359
+ # return latent, cond_inputs
1360
+ else:
1361
+ im = Image.open(self.images[index])
1362
+ im_tensor = torchvision.transforms.Compose([
1363
+ torchvision.transforms.Resize(self.im_size),
1364
+ torchvision.transforms.CenterCrop(self.im_size),
1365
+ torchvision.transforms.ToTensor(),
1366
+ ])(im)
1367
+ im.close()
1368
+ # im_tensor = torchvision.transforms.ToTensor()(im)
1369
+
1370
+ # Convert input to -1 to 1 range.
1371
+ im_tensor = (2 * im_tensor) - 1
1372
+ if len(self.condition_types) == 0:
1373
+ return im_tensor
1374
+ # else:
1375
+ # return im_tensor, cond_inputs
1376
+
1377
+
1378
+ import glob
1379
+ import os
1380
+ import random
1381
+ import torch
1382
+ import torchvision
1383
+ import numpy as np
1384
+ from PIL import Image
1385
+ from tqdm import tqdm
1386
+ from torch.utils.data.dataset import Dataset
1387
+
1388
+
1389
+ class CelebDataset(Dataset):
1390
+ def __init__(self, split, im_path, im_size, im_channels,
1391
+ use_latents=False, latent_path=None, condition_config=None):
1392
+
1393
+ self.split = split
1394
+ self.im_size = im_size
1395
+ self.im_channels = im_channels
1396
+
1397
+ # Should we use latents or not
1398
+ self.latent_maps = None
1399
+ self.use_latents = False
1400
+
1401
+ # Conditioning for the dataset
1402
+ self.condition_types = [] if condition_config is None else condition_config['condition_types']
1403
+
1404
+ self.images = self.load_images(im_path)
1405
+
1406
+ # Whether to load images and call vae or to load latents
1407
+ if use_latents and latent_path is not None:
1408
+ latent_maps = load_latents(latent_path)
1409
+ if len(latent_maps) == len(self.images):
1410
+ self.use_latents = True
1411
+ self.latent_maps = latent_maps
1412
+ print('Found {} latents'.format(len(self.latent_maps)))
1413
+ else:
1414
+ print('Latents not found')
1415
+
1416
+ def load_images(self, im_path):
1417
+ r"""
1418
+ Gets all images from the path specified
1419
+ and stacks them all up
1420
+ :param im_path:
1421
+ :return:
1422
+ """
1423
+ assert os.path.exists(im_path), "images path {} does not exist".format(im_path)
1424
+ # ims = []
1425
+ # labels = []
1426
+ ims = [os.path.join(im_path, f) for f in os.listdir(im_path)]
1427
+ return ims
1428
+
1429
+ def __len__(self):
1430
+ return len(self.images)
1431
+
1432
+ def __getitem__(self, index):
1433
+ ######## Set Conditioning Info ########
1434
+ # cond_inputs = {}
1435
+ # if 'class' in self.condition_types:
1436
+ # cond_inputs['class'] = self.labels[index]
1437
+ #######################################
1438
+
1439
+ if self.use_latents:
1440
+ latent = self.latent_maps[self.images[index]]
1441
+ if len(self.condition_types) == 0:
1442
+ return latent
1443
+ # else:
1444
+ # return latent, cond_inputs
1445
+ else:
1446
+ im = Image.open(self.images[index])
1447
+ im_tensor = torchvision.transforms.Compose([
1448
+ # torchvision.transforms.Resize(self.im_size),
1449
+ torchvision.transforms.CenterCrop(self.im_size),
1450
+ torchvision.transforms.ToTensor(),
1451
+ ])(im)
1452
+ im.close()
1453
+ # im_tensor = torchvision.transforms.ToTensor()(im)
1454
+
1455
+ # Convert input to -1 to 1 range.
1456
+ im_tensor = (2 * im_tensor) - 1
1457
+ if len(self.condition_types) == 0:
1458
+ return im_tensor
1459
+ # else:
1460
+ # return im_tensor, cond_inputs
1461
+ import pandas as pd
1462
+ class CelebHairDataset(Dataset):
1463
+ def __init__(self, split, im_path, im_size, im_channels,
1464
+ use_latents=False, latent_path=None, condition_config=None):
1465
+
1466
+ self.df = pd.read_csv("/home/taruntejaneurips23/Ashish/DDPM/hair_df_100.csv")
1467
+ self.split = split
1468
+ self.im_size = im_size
1469
+ self.im_channels = im_channels
1470
+
1471
+ # Should we use latents or not
1472
+ self.latent_maps = None
1473
+ self.use_latents = False
1474
+
1475
+ # Conditioning for the dataset
1476
+ self.condition_types = [] if condition_config is None else condition_config['condition_types']
1477
+
1478
+ self.images = self.load_images(im_path, self.df)
1479
+
1480
+ # Whether to load images and call vae or to load latents
1481
+ if use_latents and latent_path is not None:
1482
+ latent_maps = load_latents(latent_path)
1483
+ if len(latent_maps) == len(self.images):
1484
+ self.use_latents = True
1485
+ self.latent_maps = latent_maps
1486
+ print('Found {} latents'.format(len(self.latent_maps)))
1487
+ else:
1488
+ print('Latents not found')
1489
+
1490
+ def load_images(self, im_path, df):
1491
+ r"""
1492
+ Gets all images from the path specified
1493
+ and returns their file paths
1494
+ :param im_path:
1495
+ :return:
1496
+ """
1497
+ assert os.path.exists(im_path), "images path {} does not exist".format(im_path)
1498
+ # ims = []
1499
+ # labels = []
1500
+ # ims = [os.path.join(im_path, f) for f in os.listdir(im_path)]
1501
+ ims = [os.path.join(im_path, i) for i in df.image_id.values]
1502
+ return ims
1503
+
1504
+ def __len__(self):
1505
+ return len(self.images)
1506
+
1507
+ def __getitem__(self, index):
1508
+ ######## Set Conditioning Info ########
1509
+ # cond_inputs = {}
1510
+ # if 'class' in self.condition_types:
1511
+ # cond_inputs['class'] = self.labels[index]
1512
+ #######################################
1513
+
1514
+ if self.use_latents:
1515
+ latent = self.latent_maps[self.images[index]]
1516
+ if len(self.condition_types) == 0:
1517
+ return latent
1518
+ # else:
1519
+ # return latent, cond_inputs
1520
+ else:
1521
+ im = Image.open(self.images[index])
1522
+ im_tensor = torchvision.transforms.Compose([
1523
+ # torchvision.transforms.Resize(self.im_size),
1524
+ torchvision.transforms.CenterCrop(self.im_size),
1525
+ torchvision.transforms.ToTensor(),
1526
+ ])(im)
1527
+ im.close()
1528
+ # im_tensor = torchvision.transforms.ToTensor()(im)
1529
+
1530
+ # Convert input to -1 to 1 range.
1531
+ im_tensor = (2 * im_tensor) - 1
1532
+ if len(self.condition_types) == 0:
1533
+ return im_tensor
1534
+ # else:
1535
+ # return im_tensor, cond_inputs
1536
+
1537
+ # ##################### Train VQVAE #####################
1538
+
1539
+ # Commented out IPython magic to ensure Python compatibility.
1540
+ import torch
1541
+ import torch.nn as nn
1542
+ import yaml
1543
+ from dotdict import DotDict
1544
+
1545
+ config_path = "/home/taruntejaneurips23/Ashish/DDPM/_5_ldm_celeba.yaml"
1546
+ with open(config_path, 'r') as file:
1547
+ Config = yaml.safe_load(file)
1548
+
1549
+
1550
+ Config = DotDict.from_dict(Config)
1551
+ dataset_config = Config.dataset_params
1552
+ diffusion_config = Config.diffusion_params
1553
+ model_config = Config.model_params
1554
+ train_config = Config.train_params
1555
+
1556
+ import torch
1557
+ import os
1558
+ import random
1559
+ import numpy as np
1560
+ import matplotlib.pyplot as plt
1561
+ from tqdm import tqdm
1562
+ from torch.optim import Adam
1563
+ from torch.utils.data import Dataset, TensorDataset, DataLoader
1564
+ # device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
1565
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
1566
+
1567
+
1568
+
1569
+ from torchvision.utils import make_grid
1570
+
1571
+ def trainVAE(Config):
1572
+
1573
+ dataset_config = Config.dataset_params
1574
+ autoencoder_config = Config.autoencoder_params
1575
+ train_config = Config.train_params
1576
+
1577
+ # Set the desired seed value #
1578
+ seed = train_config.seed
1579
+ torch.manual_seed(seed)
1580
+ np.random.seed(seed)
1581
+ random.seed(seed)
1582
+ if device == 'cuda':
1583
+ torch.cuda.manual_seed_all(seed)
1584
+ #############################
1585
+
1586
+ # Create the model and dataset #
1587
+ model = VQVAE(im_channels=dataset_config.im_channels,
1588
+ model_config=autoencoder_config).to(device)
1589
+ # model.load_state_dict(torch.load("/home/taruntejaneurips23/Ashish/DDPM/celebAhair_ldm/vqvae_autoencoder_ckpt.pth", map_location=device))
1590
+ if os.path.exists(os.path.join(train_config.task_name, train_config.vqvae_autoencoder_ckpt_name)):
1591
+ print('Loaded vae checkpoint')
1592
+ model.load_state_dict(torch.load(os.path.join(train_config.task_name, train_config.vqvae_autoencoder_ckpt_name),
1593
+ map_location=device, weights_only=True))
1594
+
1595
+ # Create the dataset
1596
+ im_dataset_cls = {
1597
+ 'mnist': MnistDataset,
1598
+ 'celebA': CelebDataset,
1599
+ 'animeface': AnimeFaceDataset,
1600
+ 'celebAhair': CelebHairDataset
1601
+ }.get(dataset_config.name)
1602
+
1603
+ im_dataset = im_dataset_cls(split='train',
1604
+ im_path=dataset_config.im_path,
1605
+ im_size=dataset_config.im_size,
1606
+ im_channels=dataset_config.im_channels)
1607
+
1608
+ data_loader = DataLoader(im_dataset,
1609
+ batch_size=train_config.autoencoder_batch_size,
1610
+ shuffle=True,
1611
+ num_workers=os.cpu_count(),
1612
+ pin_memory=True,
1613
+ drop_last=True,
1614
+ persistent_workers=True, pin_memory_device=device)
1615
+
1616
+ # Create output directories
1617
+ if not os.path.exists(train_config.task_name):
1618
+ os.mkdir(train_config.task_name)
1619
+
1620
+ num_epochs = train_config.autoencoder_epochs
1621
+
1622
+ # L1/L2 loss for Reconstruction
1623
+ recon_criterion = torch.nn.MSELoss()
1624
+ # Disc Loss can even be BCEWithLogits
1625
+ disc_criterion = torch.nn.MSELoss()
1626
+
1627
+ # No need to freeze lpips as lpips.py takes care of that
1628
+ lpips_model = LPIPS().eval().to(device)
1629
+ discriminator = Discriminator(im_channels=dataset_config.im_channels).to(device)
1630
+ # discriminator.load_state_dict(torch.load("/home/taruntejaneurips23/Ashish/DDPM/celebAhair_ldm/vqvae_discriminator_ckpt.pth", map_location=device))
1631
+ if os.path.exists(os.path.join(train_config.task_name, train_config.vqvae_discriminator_ckpt_name)):
1632
+ print('Loaded discriminator checkpoint')
1633
+ discriminator.load_state_dict(torch.load(os.path.join(train_config.task_name, train_config.vqvae_discriminator_ckpt_name),
1634
+ map_location=device, weights_only=True))
1635
+
1636
+ optimizer_d = Adam(discriminator.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))
1637
+ optimizer_g = Adam(model.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))
1638
+
1639
+ disc_step_start = train_config.disc_start
1640
+ step_count = 0
1641
+
1642
+ # This is for accumulating gradients in case the images are huge
1643
+ # and one can't afford larger batch sizes (see the sketch after this function)
1644
+ acc_steps = train_config.autoencoder_acc_steps
1645
+ image_save_steps = train_config.autoencoder_img_save_steps
1646
+ img_save_count = 0
1647
+
1648
+ for epoch_idx in trange(num_epochs, desc='Training VQVAE'):
1649
+ recon_losses = []
1650
+ codebook_losses = []
1651
+ #commitment_losses = []
1652
+ perceptual_losses = []
1653
+ disc_losses = []
1654
+ gen_losses = []
1655
+ losses = []
1656
+
1657
+ optimizer_g.zero_grad()
1658
+ optimizer_d.zero_grad()
1659
+
1660
+ # for im in tqdm(data_loader):
1661
+ for im in data_loader:
1662
+ step_count += 1
1663
+ im = im.float().to(device)
1664
+
1665
+ # Fetch the autoencoder's output (reconstructions)
1666
+ model_output = model(im)
1667
+ output, z, quantize_losses = model_output
1668
+
1669
+ # Image Saving Logic
1670
+ if step_count % image_save_steps == 0 or step_count == 1:
1671
+ sample_size = min(8, im.shape[0])
1672
+ save_output = torch.clamp(output[:sample_size], -1., 1.).detach().cpu()
1673
+ save_output = ((save_output + 1) / 2)
1674
+ save_input = ((im[:sample_size] + 1) / 2).detach().cpu()
1675
+
1676
+ grid = make_grid(torch.cat([save_input, save_output], dim=0), nrow=sample_size)
1677
+ img = torchvision.transforms.ToPILImage()(grid)
1678
+ if not os.path.exists(os.path.join(train_config.task_name,'vqvae_autoencoder_samples')):
1679
+ os.mkdir(os.path.join(train_config.task_name, 'vqvae_autoencoder_samples'))
1680
+ img.save(os.path.join(train_config.task_name,'vqvae_autoencoder_samples',
1681
+ 'current_autoencoder_sample_{}.png'.format(img_save_count)))
1682
+ img_save_count += 1
1683
+ img.close()
1684
+
1685
+ ######### Optimize Generator ##########
1686
+ # L2 Loss
1687
+ recon_loss = recon_criterion(output, im)
1688
+ recon_losses.append(recon_loss.item())
1689
+ recon_loss = recon_loss / acc_steps
1690
+ g_loss = (recon_loss +
1691
+ (train_config.codebook_weight * quantize_losses['codebook_loss'] / acc_steps) +
1692
+ (train_config.commitment_beta * quantize_losses['commitment_loss'] / acc_steps))
1693
+ codebook_losses.append(train_config.codebook_weight * quantize_losses['codebook_loss'].item())
1694
+ # Adversarial loss only if disc_step_start steps passed
1695
+ if step_count > disc_step_start:
1696
+ disc_fake_pred = discriminator(model_output[0])
1697
+ disc_fake_loss = disc_criterion(disc_fake_pred,
1698
+ torch.ones(disc_fake_pred.shape,
1699
+ device=disc_fake_pred.device))
1700
+ gen_losses.append(train_config.disc_weight * disc_fake_loss.item())
1701
+ g_loss += train_config.disc_weight * disc_fake_loss / acc_steps
1702
+ lpips_loss = torch.mean(lpips_model(output, im)) / acc_steps
1703
+ perceptual_losses.append(train_config.perceptual_weight * lpips_loss.item())
1704
+ g_loss += train_config.perceptual_weight*lpips_loss / acc_steps
1705
+ losses.append(g_loss.item())
1706
+ g_loss.backward()
1707
+ #####################################
1708
+
1709
+ ######### Optimize Discriminator #######
1710
+ if step_count > disc_step_start:
1711
+ fake = output
1712
+ disc_fake_pred = discriminator(fake.detach())
1713
+ disc_real_pred = discriminator(im)
1714
+ disc_fake_loss = disc_criterion(disc_fake_pred,
1715
+ torch.zeros(disc_fake_pred.shape,
1716
+ device=disc_fake_pred.device))
1717
+ disc_real_loss = disc_criterion(disc_real_pred,
1718
+ torch.ones(disc_real_pred.shape,
1719
+ device=disc_real_pred.device))
1720
+ disc_loss = train_config.disc_weight * (disc_fake_loss + disc_real_loss) / 2
1721
+ disc_losses.append(disc_loss.item())
1722
+ disc_loss = disc_loss / acc_steps
1723
+ disc_loss.backward()
1724
+ if step_count % acc_steps == 0:
1725
+ optimizer_d.step()
1726
+ optimizer_d.zero_grad()
1727
+ #####################################
1728
+
1729
+ if step_count % acc_steps == 0:
1730
+ optimizer_g.step()
1731
+ optimizer_g.zero_grad()
1732
+ optimizer_d.step()
1733
+ optimizer_d.zero_grad()
1734
+ optimizer_g.step()
1735
+ optimizer_g.zero_grad()
1736
+ if len(disc_losses) > 0:
1737
+ print(
1738
+ 'Finished epoch: {}/{} | Recon Loss : {:.4f} | Perceptual Loss : {:.4f} | '
1739
+ 'Codebook : {:.4f} | G Loss : {:.4f} | D Loss {:.4f}'.
1740
+ format(epoch_idx + 1,
1741
+ num_epochs,
1742
+ np.mean(recon_losses),
1743
+ np.mean(perceptual_losses),
1744
+ np.mean(codebook_losses),
1745
+ np.mean(gen_losses),
1746
+ np.mean(disc_losses)))
1747
+ else:
1748
+ print('Finished epoch: {}/{} | Recon Loss : {:.4f} | Perceptual Loss : {:.4f} | Codebook : {:.4f}'.
1749
+ format(epoch_idx + 1,
1750
+ num_epochs,
1751
+ np.mean(recon_losses),
1752
+ np.mean(perceptual_losses),
1753
+ np.mean(codebook_losses)))
1754
+
1755
+ torch.save(model.state_dict(), os.path.join(train_config.task_name,
1756
+ train_config.vqvae_autoencoder_ckpt_name))
1757
+ torch.save(discriminator.state_dict(), os.path.join(train_config.task_name,
1758
+ train_config.vqvae_discriminator_ckpt_name))
1759
+ print('Done Training...')
1760
+
1761
+
1762
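+ # --- Illustrative sketch (commented out) ----------------------------------
+ # The gradient-accumulation pattern used by trainVAE (autoencoder_acc_steps)
+ # in isolation: losses are divided by acc_steps and the optimizer only steps
+ # every acc_steps batches, giving an effective batch size of
+ # batch_size * acc_steps. The model, data and lr below are toy assumptions.
+ # def accumulation_demo(acc_steps=4):
+ #     net = nn.Linear(8, 1)
+ #     opt = Adam(net.parameters(), lr=1e-3)
+ #     opt.zero_grad()
+ #     for step in range(1, 13):
+ #         x, y = torch.randn(16, 8), torch.randn(16, 1)
+ #         loss = torch.nn.functional.mse_loss(net(x), y) / acc_steps
+ #         loss.backward()
+ #         if step % acc_steps == 0:
+ #             opt.step()
+ #             opt.zero_grad()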
+ # trainVAE(Config)
1763
+
1764
+ import torch
1765
+ import torch.nn as nn
1766
+
1767
+
1768
+ class Unet(nn.Module):
1769
+ r"""
1770
+ Unet model comprising
1771
+ Down blocks, Mid blocks and Up blocks
1772
+ """
1773
+
1774
+ def __init__(self, im_channels, model_config):
1775
+ super().__init__()
1776
+ self.down_channels = model_config.down_channels
1777
+ self.mid_channels = model_config.mid_channels
1778
+ self.t_emb_dim = model_config.time_emb_dim
1779
+ self.down_sample = model_config.down_sample
1780
+ self.num_down_layers = model_config.num_down_layers
1781
+ self.num_mid_layers = model_config.num_mid_layers
1782
+ self.num_up_layers = model_config.num_up_layers
1783
+ self.attns = model_config.attn_down
1784
+ self.norm_channels = model_config.norm_channels
1785
+ self.num_heads = model_config.num_heads
1786
+ self.conv_out_channels = model_config.conv_out_channels
1787
+
1788
+ assert self.mid_channels[0] == self.down_channels[-1]
1789
+ assert self.mid_channels[-1] == self.down_channels[-2]
1790
+ assert len(self.down_sample) == len(self.down_channels) - 1
1791
+ assert len(self.attns) == len(self.down_channels) - 1
1792
+
1793
+ # Initial projection from sinusoidal time embedding
1794
+ self.t_proj = nn.Sequential(
1795
+ nn.Linear(self.t_emb_dim, self.t_emb_dim),
1796
+ nn.SiLU(),
1797
+ nn.Linear(self.t_emb_dim, self.t_emb_dim)
1798
+ )
1799
+
1800
+ self.up_sample = list(reversed(self.down_sample))
1801
+ self.conv_in = nn.Conv2d(im_channels, self.down_channels[0], kernel_size=3, padding=1)
1802
+
1803
+ self.downs = nn.ModuleList([])
1804
+ for i in range(len(self.down_channels) - 1):
1805
+ self.downs.append(DownBlock(self.down_channels[i], self.down_channels[i + 1], self.t_emb_dim,
1806
+ down_sample=self.down_sample[i],
1807
+ num_heads=self.num_heads,
1808
+ num_layers=self.num_down_layers,
1809
+ attn=self.attns[i], norm_channels=self.norm_channels))
1810
+
1811
+ self.mids = nn.ModuleList([])
1812
+ for i in range(len(self.mid_channels) - 1):
1813
+ self.mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i + 1], self.t_emb_dim,
1814
+ num_heads=self.num_heads,
1815
+ num_layers=self.num_mid_layers,
1816
+ norm_channels=self.norm_channels))
1817
+
1818
+ self.ups = nn.ModuleList([])
1819
+ for i in reversed(range(len(self.down_channels) - 1)):
1820
+ self.ups.append(UpBlockUnet(self.down_channels[i] * 2, self.down_channels[i - 1] if i != 0 else self.conv_out_channels,
1821
+ self.t_emb_dim, up_sample=self.down_sample[i],
1822
+ num_heads=self.num_heads,
1823
+ num_layers=self.num_up_layers,
1824
+ norm_channels=self.norm_channels))
1825
+
1826
+ self.norm_out = nn.GroupNorm(self.norm_channels, self.conv_out_channels)
1827
+ self.conv_out = nn.Conv2d(self.conv_out_channels, im_channels, kernel_size=3, padding=1)
1828
+
1829
+ def forward(self, x, t):
1830
+ # Shapes assuming downblocks are [C1, C2, C3, C4]
1831
+ # Shapes assuming midblocks are [C4, C4, C3]
1832
+ # Shapes assuming downsamples are [True, True, False]
1833
+ # B x C x H x W
1834
+ out = self.conv_in(x)
1835
+ # B x C1 x H x W
1836
+
1837
+ # t_emb -> B x t_emb_dim
1838
+ t_emb = get_time_embedding(torch.as_tensor(t).long(), self.t_emb_dim)
1839
+ t_emb = self.t_proj(t_emb)
1840
+
1841
+ down_outs = []
1842
+
1843
+ for idx, down in enumerate(self.downs):
1844
+ down_outs.append(out)
1845
+ out = down(out, t_emb)
1846
+ # down_outs [B x C1 x H x W, B x C2 x H/2 x W/2, B x C3 x H/4 x W/4]
1847
+ # out B x C4 x H/4 x W/4
1848
+
1849
+ for mid in self.mids:
1850
+ out = mid(out, t_emb)
1851
+ # out B x C3 x H/4 x W/4
1852
+
1853
+ for up in self.ups:
1854
+ down_out = down_outs.pop()
1855
+ out = up(out, down_out, t_emb)
1856
+ # out [B x C2 x H/4 x W/4, B x C1 x H/2 x W/2, B x 16 x H x W]
1857
+ out = self.norm_out(out)
1858
+ out = nn.SiLU()(out)
1859
+ out = self.conv_out(out)
1860
+ # out B x C x H x W
1861
+ return out
1862
+
1863
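+ # --- Illustrative sketch (commented out) ----------------------------------
+ # Toy shape check for the Unet above. The config values are small, arbitrary
+ # assumptions chosen only to satisfy the asserts in __init__, not the values
+ # used for actual LDM training.
+ # toy_cfg = DotDict.from_dict({
+ #     'down_channels': [32, 64, 64, 64], 'mid_channels': [64, 64],
+ #     'down_sample': [True, True, False], 'attn_down': [True, True, True],
+ #     'time_emb_dim': 64, 'norm_channels': 8, 'num_heads': 4,
+ #     'conv_out_channels': 16, 'num_down_layers': 1, 'num_mid_layers': 1,
+ #     'num_up_layers': 1})
+ # unet = Unet(im_channels=3, model_config=toy_cfg)
+ # latents = torch.randn(2, 3, 16, 16)
+ # t = torch.randint(0, 1000, (2,))
+ # print(unet(latents, t).shape)   # expected: torch.Size([2, 3, 16, 16])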
+ def trainLDM(Config):
1864
+
1865
+ diffusion_config = Config.diffusion_params
1866
+ dataset_config = Config.dataset_params
1867
+ diffusion_model_config = Config.ldm_params
1868
+ autoencoder_model_config = Config.autoencoder_params
1869
+ train_config = Config.train_params
1870
+
1871
+ # Create the noise scheduler
1872
+ scheduler = LinearNoiseScheduler(num_timesteps=diffusion_config.num_timesteps,
1873
+ beta_start=diffusion_config.beta_start,
1874
+ beta_end=diffusion_config.beta_end)
1875
+ # scheduler = CosineNoiseScheduler(diffusion_config.num_timesteps)
1876
+
1877
+ im_dataset_cls = {
1878
+ 'mnist': MnistDataset,
1879
+ 'celebA': CelebDataset,
1880
+ 'animeface': AnimeFaceDataset,
1881
+ 'celebAhair': CelebHairDataset
1882
+ }.get(dataset_config.name)
1883
+
1884
+ im_dataset = im_dataset_cls(split='train',
1885
+ im_path=dataset_config.im_path,
1886
+ im_size=dataset_config.im_size,
1887
+ im_channels=dataset_config.im_channels,
1888
+ use_latents=True,
1889
+ latent_path=os.path.join(train_config.task_name,
1890
+ train_config.vqvae_latent_dir_name)
1891
+ )
1892
+
1893
+ data_loader = DataLoader(im_dataset,
1894
+ batch_size=train_config.ldm_batch_size,
1895
+ shuffle=True,
1896
+ num_workers=os.cpu_count(),
1897
+ pin_memory=True,
1898
+ drop_last=False,
1899
+ persistent_workers=True, pin_memory_device=device)
1900
+
1901
+ # Instantiate the model
1902
+ model = Unet(im_channels=autoencoder_model_config.z_channels,
1903
+ model_config=diffusion_model_config).to(device)
1904
+ if os.path.exists(os.path.join(train_config.task_name, train_config.ldm_ckpt_name)):
1905
+ print('Loaded ldm checkpoint')
1906
+ model.load_state_dict(torch.load(os.path.join(train_config.task_name, train_config.ldm_ckpt_name), map_location=device, weights_only=True))
1907
+ model.train()
1908
+
1909
+ # Load VAE ONLY if latents are not to be used or are missing
1910
+ if not im_dataset.use_latents:
1911
+ print('Loading vqvae model as latents not present')
1912
+ vae = VQVAE(im_channels=dataset_config.im_channels,
1913
+ model_config=autoencoder_model_config).to(device)
1914
+ vae.eval()
1915
+ # Load vae if found
1916
+ if os.path.exists(os.path.join(train_config.task_name,
1917
+ train_config.vqvae_autoencoder_ckpt_name)):
1918
+ print('Loaded vae checkpoint')
1919
+ vae.load_state_dict(torch.load(os.path.join(train_config.task_name,
1920
+ train_config.vqvae_autoencoder_ckpt_name),
1921
+ map_location=device))
1922
+ # Specify training parameters
1923
+ num_epochs = train_config.ldm_epochs
1924
+ optimizer = Adam(model.parameters(), lr=train_config.ldm_lr)
1925
+ criterion = torch.nn.MSELoss()
1926
+
1927
+ # Run training
1928
+ if not im_dataset.use_latents:
1929
+ for param in vae.parameters():
1930
+ param.requires_grad = False
1931
+
1932
+ for epoch_idx in range(num_epochs):
1933
+ losses = []
1934
+ for im in tqdm(data_loader):
1935
+ optimizer.zero_grad()
1936
+ im = im.float().to(device)
1937
+ if not im_dataset.use_latents:
1938
+ with torch.no_grad():
1939
+ im, _ = vae.encode(im)
1940
+
1941
+ # Sample random noise
1942
+ noise = torch.randn_like(im).to(device)
1943
+
1944
+ # Sample timestep
1945
+ t = torch.randint(0, diffusion_config.num_timesteps, (im.shape[0],)).to(device)
1946
+
1947
+ # Add noise to images according to timestep
1948
+ noisy_im = scheduler.add_noise(im, noise, t)
1949
+ noise_pred = model(noisy_im, t)
1950
+
1951
+ loss = criterion(noise_pred, noise)
1952
+ losses.append(loss.item())
1953
+ loss.backward()
1954
+ optimizer.step()
1955
+ print(f'Finished epoch:{epoch_idx + 1}/{num_epochs} | Loss : {np.mean(losses):.4f}')
1956
+
1957
+ torch.save(model.state_dict(), os.path.join(train_config.task_name,
1958
+ train_config.ldm_ckpt_name))
1959
+
1960
+ # Doing Inference
1961
+ infer(Config)
1962
+
1963
+ # Check whether to continue training
1964
+ train_continue = yaml.safe_load(open("/home/taruntejaneurips23/Ashish/DDPM/_5_ldm_celeba.yaml", 'r'))
1965
+ train_continue = DotDict.from_dict(train_continue)
1966
+ if train_continue.training._continue_ == False:
1967
+ print('Training Stopped ...')
1968
+ break
1969
+
1970
+ print('Done Training ...')
1971
+
1972
+ # trainLDM(Config)
1973
+
1974
+ # import subprocess
1975
+ # subprocess.run(f'kill {os.getpid()}', shell=True, check=True)
1976
+
1977
+ def sample(model, scheduler, train_config, diffusion_model_config,
1978
+ autoencoder_model_config, diffusion_config, dataset_config, vae):
1979
+ r"""
1980
+ Sample stepwise by going backward one timestep at a time.
1981
+ We save the x0 predictions
1982
+ """
1983
+ im_size = dataset_config.im_size // 2**sum(autoencoder_model_config.down_sample)
1984
+ xt = torch.randn((train_config.num_samples,
1985
+ autoencoder_model_config.z_channels,
1986
+ im_size,
1987
+ im_size)).to(device)
1988
+
1989
+ save_count = 0
1990
+ for i in tqdm(reversed(range(diffusion_config.num_timesteps)), total=diffusion_config.num_timesteps):
1991
+ # Get prediction of noise
1992
+ noise_pred = model(xt, torch.as_tensor(i).unsqueeze(0).to(device))
1993
+
1994
+ # Use scheduler to get x0 and xt-1
1995
+ xt, x0_pred = scheduler.sample_prev_timestep(xt, noise_pred, torch.as_tensor(i).to(device))
1996
+
1997
+ # Save x0
1998
+ #ims = torch.clamp(xt, -1., 1.).detach().cpu()
1999
+ if i == 0:
2000
+ # Decode ONLY the final image to save time
2001
+ ims = vae.decode(xt)
2002
+ else:
2003
+ ims = xt
2004
+
2005
+ ims = torch.clamp(ims, -1., 1.).detach().cpu()
2006
+ ims = (ims + 1) / 2
2007
+ grid = make_grid(ims, nrow=train_config.num_grid_rows)
2008
+ img = torchvision.transforms.ToPILImage()(grid)
2009
+
2010
+ if not os.path.exists(os.path.join(train_config.task_name, 'samples')):
2011
+ os.mkdir(os.path.join(train_config.task_name, 'samples'))
2012
+ img.save(os.path.join(train_config.task_name, 'samples', 'x0_{}.png'.format(i)))
2013
+ img.close()
2014
+
2015
+
2016
+ def infer(Config):
2017
+
2018
+ diffusion_config = Config.diffusion_params
2019
+ dataset_config = Config.dataset_params
2020
+ diffusion_model_config = Config.ldm_params
2021
+ autoencoder_model_config = Config.autoencoder_params
2022
+ train_config = Config.train_params
2023
+
2024
+ # Create the noise scheduler
2025
+ scheduler = LinearNoiseScheduler(num_timesteps=diffusion_config.num_timesteps,
2026
+ beta_start=diffusion_config.beta_start,
2027
+ beta_end=diffusion_config.beta_end)
2028
+ # scheduler = CosineNoiseScheduler(diffusion_config.num_timesteps)
2029
+
2030
+ model = Unet(im_channels=autoencoder_model_config.z_channels,
2031
+ model_config=diffusion_model_config).to(device)
2032
+ model.eval()
2033
+ if os.path.exists(os.path.join(train_config.task_name,
2034
+ train_config.ldm_ckpt_name)):
2035
+ print('Loaded unet checkpoint')
2036
+ model.load_state_dict(torch.load(os.path.join(train_config.task_name,
2037
+ train_config.ldm_ckpt_name),
2038
+ map_location=device))
2039
+ # Create output directories
2040
+ if not os.path.exists(train_config.task_name):
2041
+ os.mkdir(train_config.task_name)
2042
+
2043
+ vae = VQVAE(im_channels=dataset_config.im_channels,
2044
+ model_config=autoencoder_model_config).to(device)
2045
+ vae.eval()
2046
+
2047
+ # Load vae if found
2048
+ if os.path.exists(os.path.join(train_config.task_name,
2049
+ train_config.vqvae_autoencoder_ckpt_name)):
2050
+ print('Loaded vae checkpoint')
2051
+ vae.load_state_dict(torch.load(os.path.join(train_config.task_name,
2052
+ train_config.vqvae_autoencoder_ckpt_name),
2053
+ map_location=device), strict=True)
2054
+ with torch.no_grad():
2055
+ sample(model, scheduler, train_config, diffusion_model_config,
2056
+ autoencoder_model_config, diffusion_config, dataset_config, vae)
2057
+
2058
+
2059
+
2060
+ import argparse
2061
+
2062
+ def get_args():
2063
+ parser = argparse.ArgumentParser(description="Choose between train VAE, train LDM, or infer mode.")
2064
+ parser.add_argument('--mode', choices=['train_vae', 'train_ldm', 'infer'], default='infer',
2065
+ help="Mode to run: train_vae, train_ldm, or infer")
2066
+ return parser.parse_args()
2067
+
2068
+ args = get_args()
2069
+
2070
+ if args.mode == 'train_vae':
2071
+ trainVAE(Config)
2072
+ elif args.mode == 'train_ldm':
2073
+ trainLDM(Config)
2074
+ else:
2075
+ infer(Config)
2076
+
2077
+ # python _5.2_ldm_celeba_hair_cosine.py --mode train_vae
2078
+ # python _5.2_ldm_celeba_hair_cosine.py --mode train_ldm
2079
+ # python _5.2_ldm_celeba_hair_cosine.py --mode infer
2080
+
2081
+
2082
+
2083
+
2084
+ # import matplotlib.pyplot as plt
2085
+ # from PIL import Image
2086
+ # # plt.style.use('dark_background')
2087
+ # # %matplotlib inline
2088
+
2089
+ # plt.imshow(Image.open('/home/taruntejaneurips23/Ashish/DDPM/mnist_ldm/samples/x0_0.png'), cmap='gray')
2090
+
2091
+ # import matplotlib.pyplot as plt
2092
+ # import matplotlib.image as mpimg
2093
+
2094
+ # dataset_name = 'animeface_ldm'
2095
+
2096
+ # image_paths = [f'/home/taruntejaneurips23/Ashish/DDPM/{dataset_name}/samples/x0_0.png',
2097
+ # f'/home/taruntejaneurips23/Ashish/DDPM/{dataset_name}/samples/x0_1.png',
2098
+ # f'/home/taruntejaneurips23/Ashish/DDPM/{dataset_name}/samples/x0_5.png',
2099
+ # f'/home/taruntejaneurips23/Ashish/DDPM/{dataset_name}/samples/x0_100.png',
2100
+ # f'/home/taruntejaneurips23/Ashish/DDPM/{dataset_name}/samples/x0_200.png'
2101
+ # ]
2102
+
2103
+ # fig, axes = plt.subplots(1, len(image_paths), figsize=(15, 5))
2104
+
2105
+ # for i, path in enumerate(image_paths):
2106
+ # img = mpimg.imread(path)
2107
+ # axes[i].imshow(img)
2108
+ # axes[i].axis('off') # Hide axes
2109
+ # axes[i].set_title(f't = {path.split("/")[-1].split(".")[0].split("_")[-1]}')
2110
+
2111
+ # plt.tight_layout()
2112
+ # plt.show()
2113
+
2114
+ # ---------------------------------------------------------
2115
+ # ---------- T H E - E N D -------------------------------
2116
+ # ---------------------------------------------------------
2117
+
2118
+
2119
+
2120
+ def save_checkpoint(
2121
+ total_steps, epoch, model, discriminator,
2122
+ optimizer_d, optimizer_g, loss, checkpoint_path
2123
+ ):
2124
+ checkpoint = {
2125
+ "total_steps": total_steps,
2126
+ "epoch": epoch,
2127
+ "model_state_dict": model.state_dict(),
2128
+ "discriminator_state_dict": discriminator.state_dict(),
2129
+ "optimizer_d_state_dict": optimizer_d.state_dict(),
2130
+ "optimizer_g_state_dict": optimizer_g.state_dict(),
2131
+ "loss": loss,
2132
+ }
2133
+ torch.save(checkpoint, checkpoint_path)
2134
+ print(f"Checkpoint saved after {total_steps} steps at epoch {epoch}")
2135
+
2136
+
2137
+ def load_checkpoint(
2138
+ checkpoint_path, model, discriminator, optimizer_d, optimizer_g
2139
+ ):
2140
+ if os.path.exists(checkpoint_path):
2141
+ checkpoint = torch.load(checkpoint_path)
2142
+ model.load_state_dict(checkpoint["model_state_dict"])
2143
+ discriminator.load_state_dict(checkpoint["discriminator_state_dict"])
2144
+ optimizer_d.load_state_dict(checkpoint["optimizer_d_state_dict"])
2145
+ optimizer_g.load_state_dict(checkpoint["optimizer_g_state_dict"])
2146
+ total_steps = checkpoint["total_steps"]
2147
+ start_epoch = checkpoint["epoch"] + 1
2148
+ loss = checkpoint["loss"]
2149
+ print(f"Checkpoint loaded. Resuming from epoch {start_epoch}")
2150
+ return total_steps, start_epoch, loss
2151
+ else:
2152
+ print("No checkpoint found. Starting from scratch.")
2153
+ return 0, 0, None
2154
+
2155
+
2156
+ def trainVAE(Config, dataloader):
2157
+ """
2158
+ Trains a VQVAE model using the provided configuration and data loader.
2159
+ """
2160
+ # --- Configurations ----------------------------------------------------
2161
+ dataset_config = Config.dataset_params
2162
+ autoencoder_config = Config.autoencoder_params
2163
+ train_config = Config.train_params
2164
+
2165
+ seed = train_config.seed
2166
+ torch.manual_seed(seed)
2167
+ np.random.seed(seed)
2168
+ random.seed(seed)
2169
+ if device == "cuda":
2170
+ torch.cuda.manual_seed_all(seed)
2171
+
2172
+ # --- Model Initialization ----------------------------------------------
2173
+ model = VQVAE(im_channels=dataset_config.im_channels, model_config=autoencoder_config).to(device)
2174
+ discriminator = Discriminator(im_channels=dataset_config.im_channels).to(device)
2175
+
2176
+ # --- Load Checkpoints --------------------------------------------------
2177
+ checkpoint_path = os.path.join(train_config.task_name, "vqvae_checkpoint.pth")
2178
+ total_steps, start_epoch, _ = load_checkpoint(checkpoint_path, model, discriminator, None, None)
2179
+
2180
+ # --- Loss Function Initialization --------------------------------------
2181
+ recon_criterion = torch.nn.MSELoss()
2182
+ lpips_model = LPIPS().eval().to(device)
2183
+ disc_criterion = torch.nn.MSELoss()
2184
+
2185
+ # --- Optimizer Initialization ------------------------------------------
2186
+ optimizer_d = torch.optim.AdamW(discriminator.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))
2187
+ optimizer_g = torch.optim.AdamW(model.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))
2188
+
2189
+ num_epochs = train_config.autoencoder_epochs
2190
+ acc_steps = train_config.autoencoder_acc_steps
2191
+ image_save_steps = train_config.autoencoder_img_save_steps
2192
+ img_save_count = 0
2193
+
2194
+ # Create necessary directories
2195
+ os.makedirs(os.path.join(train_config.task_name, "vqvae_autoencoder_samples"), exist_ok=True)
2196
+
2197
+ # --- Training Loop -----------------------------------------------------
2198
+ for epoch_idx in range(start_epoch, num_epochs):
2199
+ recon_losses, codebook_losses, perceptual_losses, disc_losses, gen_losses = [], [], [], [], []
2200
+
2201
+ for images in dataloader:
2202
+ total_steps += 1
2203
+ images = images.to(device)
2204
+
2205
+ # Forward pass
2206
+ model_output = model(images)
2207
+ output, z, quantize_losses = model_output
2208
+
2209
+ # Save generated images periodically
2210
+ if total_steps % image_save_steps == 0 or total_steps == 1:
2211
+ sample_size = min(8, images.shape[0])
2212
+ save_output = torch.clamp(output[:sample_size], -1.0, 1.0).detach().cpu()
2213
+ save_output = (save_output + 1) / 2
2214
+ save_input = ((images[:sample_size] + 1) / 2).detach().cpu()
2215
+
2216
+ grid = make_grid(torch.cat([save_input, save_output], dim=0), nrow=sample_size)
2217
+ img = torchvision.transforms.ToPILImage()(grid)
2218
+ img.save(
2219
+ os.path.join(
2220
+ train_config.task_name,
2221
+ "vqvae_autoencoder_samples",
2222
+ f"current_autoencoder_sample_{img_save_count}.png",
2223
+ )
2224
+ )
2225
+ img_save_count += 1
2226
+ img.close()
2227
+
2228
+ # Reconstruction Loss
2229
+ recon_loss = recon_criterion(output, images) / acc_steps
2230
+ recon_losses.append(recon_loss.item())
2231
+
2232
+ # Generator Loss
2233
+ codebook_loss = train_config.codebook_weight * quantize_losses["codebook_loss"] / acc_steps
2234
+ perceptual_loss = train_config.perceptual_weight * lpips_model(output, images).mean() / acc_steps
2235
+ codebook_losses.append(codebook_loss.item())
+ perceptual_losses.append(perceptual_loss.item())
+ g_loss = recon_loss + codebook_loss + perceptual_loss
2236
+
2237
+ if total_steps > train_config.disc_start:
2238
+ disc_fake_pred = discriminator(output)
2239
+ gen_loss = train_config.disc_weight * disc_criterion(
2240
+ disc_fake_pred, torch.ones_like(disc_fake_pred)
2241
+ ) / acc_steps
2242
+ g_loss += gen_loss
2243
+ gen_losses.append(gen_loss.item())
2244
+
2245
+ g_loss.backward()
2246
+ optimizer_g.step()
2247
+ optimizer_g.zero_grad()
2248
+
2249
+ # Discriminator Loss
2250
+ if total_steps > train_config.disc_start:
2251
+ disc_fake_pred = discriminator(output.detach())
2252
+ disc_real_pred = discriminator(images)
2253
+ disc_fake_loss = disc_criterion(
2254
+ disc_fake_pred, torch.zeros_like(disc_fake_pred)
2255
+ ) / acc_steps
2256
+ disc_real_loss = disc_criterion(
2257
+ disc_real_pred, torch.ones_like(disc_real_pred)
2258
+ ) / acc_steps
2259
+ disc_loss = train_config.disc_weight * (disc_fake_loss + disc_real_loss) / 2
2260
+ disc_loss.backward()
2261
+ optimizer_d.step()
2262
+ optimizer_d.zero_grad()
2263
+ disc_losses.append(disc_loss.item())
2264
+
2265
+ # Save checkpoint after each epoch
2266
+ save_checkpoint(total_steps, epoch_idx, model, discriminator, optimizer_d, optimizer_g, recon_losses, checkpoint_path)
2267
+
2268
+ # Print epoch summary
2269
+ print(
2270
+ f"Epoch {epoch_idx + 1}/{num_epochs} | Recon Loss: {np.mean(recon_losses):.4f} | "
2271
+ f"Perceptual Loss: {np.mean(perceptual_losses):.4f} | Codebook Loss: {np.mean(codebook_losses):.4f} | "
2272
+ f"G Loss: {np.mean(gen_losses):.4f} | D Loss: {np.mean(disc_losses):.4f}"
2273
+ )
LDM/scripts/_1_Lpips.py ADDED
@@ -0,0 +1,56 @@
+ # ==================================================================
+ #     LEARNED PERCEPTUAL IMAGE PATCH SIMILARITY ( L P I P S )
+ # ==================================================================
+ # Author : Ashish Kumar Uchadiya
+ # Created : January 18, 2025
+ # Description: LPIPS essentially computes the similarity between the
+ # activations of two image patches for some pre-defined network.
+ # This measure has been shown to match human perception well.
+ # A low LPIPS score means that the image patches are perceptually similar.
+ # ==================================================================
+
+ from collections import namedtuple
+
+ import torch
+ import torchvision
+
+
+ class vgg16(torch.nn.Module):
+     def __init__(self, requires_grad=False, pretrained=True):
+         super(vgg16, self).__init__()
+         vgg_pretrained_features = torchvision.models.vgg16(
+             weights=torchvision.models.VGG16_Weights.IMAGENET1K_V1
+         ).features
+         self.slice1 = torch.nn.Sequential()
+         self.slice2 = torch.nn.Sequential()
+         self.slice3 = torch.nn.Sequential()
+         self.slice4 = torch.nn.Sequential()
+         self.slice5 = torch.nn.Sequential()
+         self.N_slices = 5
+         # Split the pretrained VGG16 feature stack into five slices ending at
+         # relu1_2, relu2_2, relu3_3, relu4_3 and relu5_3 respectively.
+         for x in range(4):
+             self.slice1.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(4, 9):
+             self.slice2.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(9, 16):
+             self.slice3.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(16, 23):
+             self.slice4.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(23, 30):
+             self.slice5.add_module(str(x), vgg_pretrained_features[x])
+
+         # Freeze the VGG weights
+         if not requires_grad:
+             for param in self.parameters():
+                 param.requires_grad = False
+
+     def forward(self, X):
+         # Return the intermediate activations of the five VGG slices
+         h = self.slice1(X)
+         h_relu1_2 = h
+         h = self.slice2(h)
+         h_relu2_2 = h
+         h = self.slice3(h)
+         h_relu3_3 = h
+         h = self.slice4(h)
+         h_relu4_3 = h
+         h = self.slice5(h)
+         h_relu5_3 = h
+         vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
+         out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
+         return out
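A possible usage sketch for the extractor above (not part of the repository): it compares two batches by the mean squared distance between channel-normalized activations at each of the five ReLU stages, which is the unweighted core of the LPIPS metric. The random tensors are stand-ins; real inputs should be ImageNet-normalized RGB batches.

import torch
import torch.nn.functional as F

extractor = vgg16(requires_grad=False, pretrained=True).eval()
x0 = torch.rand(1, 3, 224, 224)  # placeholder inputs
x1 = torch.rand(1, 3, 224, 224)
with torch.no_grad():
    feats0, feats1 = extractor(x0), extractor(x1)
for name, f0, f1 in zip(feats0._fields, feats0, feats1):
    # Unit-normalize along the channel dimension before comparing, as LPIPS does.
    d = ((F.normalize(f0, dim=1) - F.normalize(f1, dim=1)) ** 2).mean().item()
    print(f"{name}: {d:.6f}")  # smaller means more similar at that depth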
LDM/scripts/config.yaml ADDED
@@ -0,0 +1,65 @@
+ dataset_params:
+   im_path: "/home/taruntejaneurips23/Ashish/datasets/CelebA/img_align_celeba/img_align_celeba"
+   im_channels: 3
+   im_size: 28
+
+ diffusion_params:
+   num_timesteps: 1000
+   beta_start: 0.0015
+   beta_end: 0.0195
+
+ ldm_params:
+   down_channels: [128, 256, 256, 256]
+   mid_channels: [256, 256]
+   down_sample: [False, False, False]
+   attn_down: [True, True, True]
+   time_emb_dim: 256
+   norm_channels: 32
+   num_heads: 16
+   conv_out_channels: 128
+   num_down_layers: 2
+   num_mid_layers: 2
+   num_up_layers: 2
+
+ autoencoder_params:
+   z_channels: 3
+   codebook_size: 20
+   down_channels: [32, 64, 128]
+   mid_channels: [128, 128]
+   down_sample: [True, True]
+   attn_down: [False, False]
+   norm_channels: 32
+   num_heads: 16
+   num_down_layers: 2
+   num_mid_layers: 2
+   num_up_layers: 2
+
+ train_params:
+   seed: 4242
+   task_name: 'MnistLDM'
+   ldm_batch_size: 9
+   autoencoder_batch_size: 32
+   disc_start: 1000
+   disc_weight: 0.5
+   codebook_weight: 1
+   commitment_beta: 0.2
+   perceptual_weight: 1
+   kl_weight: 0.000005
+   ldm_epochs: 10
+   autoencoder_epochs: 10
+   num_samples: 9
+   num_grid_rows: 3
+   ldm_lr: 0.00001
+   autoencoder_lr: 0.0001
+   autoencoder_acc_steps: 1
+   autoencoder_img_save_steps: 8
+   save_latents: True
+   vqvae_latent_dir_name: 'vqvae_latents'
+   ldm_ckpt_name: 'ddpm_ckpt.pth'
+   vqvae_autoencoder_ckpt_name: 'vqvae_autoencoder_ckpt.pth'
+   vqvae_discriminator_ckpt_name: 'vqvae_discriminator_ckpt.pth'
+   checkpoint_dir: './'
+
+ training:
+   _continue_: True
+
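One way to consume this file is sketched below, assuming PyYAML is available; it converts the nested mappings into attribute-style objects so values read the same way the training code accesses them (e.g. `train_config.autoencoder_lr`). The project may use its own config helper, so treat this as illustrative only.

import yaml
from types import SimpleNamespace

def load_config(path="config.yaml"):
    # Parse the YAML and wrap nested dicts in SimpleNamespace for attribute access.
    with open(path) as f:
        raw = yaml.safe_load(f)

    def to_ns(node):
        if isinstance(node, dict):
            return SimpleNamespace(**{k: to_ns(v) for k, v in node.items()})
        return node

    return to_ns(raw)

cfg = load_config()
train_config = cfg.train_params
print(train_config.task_name, train_config.autoencoder_lr, cfg.autoencoder_params.codebook_size)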
Vaani/39448.err ADDED
@@ -0,0 +1,351 @@
1
+ + '[' -z '' ']'
2
+ + case "$-" in
3
+ + __lmod_vx=x
4
+ + '[' -n x ']'
5
+ + set +x
6
+ Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for this output (/usr/share/lmod/lmod/init/bash)
7
+ Shell debugging restarted
8
+ + unset __lmod_vx
9
+ + cd
10
+ + module purge
11
+ + '[' -z '' ']'
12
+ + case "$-" in
13
+ + __lmod_sh_dbg=x
14
+ + '[' -n x ']'
15
+ + set +x
16
+ Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output
17
+ Shell debugging restarted
18
+ + unset __lmod_sh_dbg
19
+ + return 0
20
+ + module load miniconda
21
+ + '[' -z '' ']'
22
+ + case "$-" in
23
+ + __lmod_sh_dbg=x
24
+ + '[' -n x ']'
25
+ + set +x
26
+ Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output
27
+ Shell debugging restarted
28
+ + unset __lmod_sh_dbg
29
+ + return 0
30
+ + source /home/apps/miniconda3/etc/profile.d/conda.sh
31
+ ++ export CONDA_EXE=/home/apps/miniconda3/bin/conda
32
+ ++ CONDA_EXE=/home/apps/miniconda3/bin/conda
33
+ ++ export _CE_M=
34
+ ++ _CE_M=
35
+ ++ export _CE_CONDA=
36
+ ++ _CE_CONDA=
37
+ ++ export CONDA_PYTHON_EXE=/home/apps/miniconda3/bin/python
38
+ ++ CONDA_PYTHON_EXE=/home/apps/miniconda3/bin/python
39
+ ++ '[' -z x ']'
40
+ + conda env list
41
+ + local cmd=env
42
+ + case "$cmd" in
43
+ + __conda_exe env list
44
+ + '[' -n '' ']'
45
+ + /home/apps/miniconda3/bin/conda env list
46
+ + conda activate aku_env
47
+ + local cmd=activate
48
+ + case "$cmd" in
49
+ + __conda_activate activate aku_env
50
+ + '[' -n '' ']'
51
+ + local ask_conda
52
+ ++ PS1=
53
+ ++ __conda_exe shell.posix activate aku_env
54
+ ++ '[' -n '' ']'
55
+ ++ /home/apps/miniconda3/bin/conda shell.posix activate aku_env
56
+ + ask_conda='unset _CE_M
57
+ unset _CE_CONDA
58
+ PS1='\''(aku_env) '\''
59
+ export PATH='\''/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/cli/servers/Stable-ddc367ed5c8936efe395cffeec279b04ffd7db78/server/bin/remote-cli:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/apps/MLDL/DL-CondaPy3/condabin:/home/IITB/ai-at-ieor/23m1521/.local/bin:/home/IITB/ai-at-ieor/23m1521/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/var/lib/snapd/snap/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/extensions/ms-python.debugpy-2025.4.1-linux-x64/bundled/scripts/noConfigScripts:/home/IITB/ai-at-ieor/23m1521/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand'\''
60
+ export CONDA_PREFIX='\''/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env'\''
61
+ export CONDA_SHLVL='\''2'\''
62
+ export CONDA_DEFAULT_ENV='\''aku_env'\''
63
+ export CONDA_PROMPT_MODIFIER='\''(aku_env) '\''
64
+ export CONDA_PREFIX_1='\''/home/apps/miniconda3'\''
65
+ export CONDA_EXE='\''/home/apps/miniconda3/bin/conda'\''
66
+ export CONDA_PYTHON_EXE='\''/home/apps/miniconda3/bin/python'\''
67
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/gdal-activate.sh"
68
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/geotiff-activate.sh"
69
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libarrow_activate.sh"
70
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libglib_activate.sh"
71
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libpdal-core_activate.sh"
72
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libxml2_activate.sh"
73
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/pdal-python-activate.sh"
74
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/proj4-activate.sh"'
75
+ + eval 'unset _CE_M
76
+ unset _CE_CONDA
77
+ PS1='\''(aku_env) '\''
78
+ export PATH='\''/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/cli/servers/Stable-ddc367ed5c8936efe395cffeec279b04ffd7db78/server/bin/remote-cli:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/apps/MLDL/DL-CondaPy3/condabin:/home/IITB/ai-at-ieor/23m1521/.local/bin:/home/IITB/ai-at-ieor/23m1521/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/var/lib/snapd/snap/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/extensions/ms-python.debugpy-2025.4.1-linux-x64/bundled/scripts/noConfigScripts:/home/IITB/ai-at-ieor/23m1521/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand'\''
79
+ export CONDA_PREFIX='\''/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env'\''
80
+ export CONDA_SHLVL='\''2'\''
81
+ export CONDA_DEFAULT_ENV='\''aku_env'\''
82
+ export CONDA_PROMPT_MODIFIER='\''(aku_env) '\''
83
+ export CONDA_PREFIX_1='\''/home/apps/miniconda3'\''
84
+ export CONDA_EXE='\''/home/apps/miniconda3/bin/conda'\''
85
+ export CONDA_PYTHON_EXE='\''/home/apps/miniconda3/bin/python'\''
86
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/gdal-activate.sh"
87
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/geotiff-activate.sh"
88
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libarrow_activate.sh"
89
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libglib_activate.sh"
90
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libpdal-core_activate.sh"
91
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libxml2_activate.sh"
92
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/pdal-python-activate.sh"
93
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/proj4-activate.sh"'
94
+ ++ unset _CE_M
95
+ ++ unset _CE_CONDA
96
+ ++ PS1='(aku_env) '
97
+ ++ export PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/cli/servers/Stable-ddc367ed5c8936efe395cffeec279b04ffd7db78/server/bin/remote-cli:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/apps/MLDL/DL-CondaPy3/condabin:/home/IITB/ai-at-ieor/23m1521/.local/bin:/home/IITB/ai-at-ieor/23m1521/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/var/lib/snapd/snap/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/extensions/ms-python.debugpy-2025.4.1-linux-x64/bundled/scripts/noConfigScripts:/home/IITB/ai-at-ieor/23m1521/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand
98
+ ++ PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/cli/servers/Stable-ddc367ed5c8936efe395cffeec279b04ffd7db78/server/bin/remote-cli:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/apps/MLDL/DL-CondaPy3/condabin:/home/IITB/ai-at-ieor/23m1521/.local/bin:/home/IITB/ai-at-ieor/23m1521/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/var/lib/snapd/snap/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/extensions/ms-python.debugpy-2025.4.1-linux-x64/bundled/scripts/noConfigScripts:/home/IITB/ai-at-ieor/23m1521/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand
99
+ ++ export CONDA_PREFIX=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env
100
+ ++ CONDA_PREFIX=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env
101
+ ++ export CONDA_SHLVL=2
102
+ ++ CONDA_SHLVL=2
103
+ ++ export CONDA_DEFAULT_ENV=aku_env
104
+ ++ CONDA_DEFAULT_ENV=aku_env
105
+ ++ export 'CONDA_PROMPT_MODIFIER=(aku_env) '
106
+ ++ CONDA_PROMPT_MODIFIER='(aku_env) '
107
+ ++ export CONDA_PREFIX_1=/home/apps/miniconda3
108
+ ++ CONDA_PREFIX_1=/home/apps/miniconda3
109
+ ++ export CONDA_EXE=/home/apps/miniconda3/bin/conda
110
+ ++ CONDA_EXE=/home/apps/miniconda3/bin/conda
111
+ ++ export CONDA_PYTHON_EXE=/home/apps/miniconda3/bin/python
112
+ ++ CONDA_PYTHON_EXE=/home/apps/miniconda3/bin/python
113
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/gdal-activate.sh
114
+ +++ '[' -n /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdal ']'
115
+ +++ export _CONDA_SET_GDAL_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdal
116
+ +++ _CONDA_SET_GDAL_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdal
117
+ +++ '[' -n /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/gdalplugins ']'
118
+ +++ export _CONDA_SET_GDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/gdalplugins
119
+ +++ _CONDA_SET_GDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/gdalplugins
120
+ +++ '[' -d /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdal ']'
121
+ +++ export GDAL_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdal
122
+ +++ GDAL_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdal
123
+ +++ export GDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/gdalplugins
124
+ +++ GDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/gdalplugins
125
+ +++ '[' '!' -d /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/gdalplugins ']'
126
+ +++ export CPL_ZIP_ENCODING=UTF-8
127
+ +++ CPL_ZIP_ENCODING=UTF-8
128
+ +++ '[' -n '4.4.20(1)-release' ']'
129
+ +++ '[' -f /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/bash-completion/completions/gdalinfo ']'
130
+ +++ source /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/bash-completion/completions/gdalinfo
131
+ ++++ function_exists _get_comp_words_by_ref
132
+ ++++ declare -f -F _get_comp_words_by_ref
133
+ ++++ return 1
134
+ ++++ return 0
135
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/geotiff-activate.sh
136
+ +++ '[' -n '' ']'
137
+ +++ '[' -d /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/epsg_csv ']'
138
+ +++ '[' -d /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/Library/share/epsg_csv ']'
139
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libarrow_activate.sh
140
+ +++ '[' -n '' ']'
141
+ +++ _la_log 'Beginning libarrow activation.'
142
+ +++ '[' '' = 1 ']'
143
+ +++ _la_gdb_prefix=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load
144
+ +++ '[' '!' -w /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load ']'
145
+ +++ _la_placeholder=replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX
146
+ +++ _la_symlink_dir=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load//home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib
147
+ +++ _la_orig_install_dir=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load/replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX/lib
148
+ +++ _la_log ' _la_gdb_prefix: /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load'
149
+ +++ '[' '' = 1 ']'
150
+ +++ _la_log ' _la_placeholder: replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX'
151
+ +++ '[' '' = 1 ']'
152
+ +++ _la_log ' _la_symlink_dir: /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load//home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib'
153
+ +++ '[' '' = 1 ']'
154
+ +++ _la_log ' _la_orig_install_dir: /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load/replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX/lib'
155
+ +++ '[' '' = 1 ']'
156
+ +++ _la_log ' content of that folder:'
157
+ +++ '[' '' = 1 ']'
158
+ ++++ ls -al /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load/replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX/lib
159
+ ++++ sed 's/^/ /'
160
+ +++ _la_log ' total 12
161
+ drwxr-sr-x 2 23m1521 ai-at-ieor 4096 Mar 23 19:37 .
162
+ drwxr-sr-x 3 23m1521 ai-at-ieor 4096 Mar 22 19:59 ..
163
+ -rw-r--r-- 1 23m1521 ai-at-ieor 992 Mar 23 19:36 libarrow.so.1900.1.0-gdb.py'
164
+ +++ '[' '' = 1 ']'
165
+ +++ for _la_target in "$_la_orig_install_dir/"*.py
166
+ +++ '[' '!' -e /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load/replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX/lib/libarrow.so.1900.1.0-gdb.py ']'
167
+ ++++ basename /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load/replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX/lib/libarrow.so.1900.1.0-gdb.py
168
+ +++ _la_symlink=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load//home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/libarrow.so.1900.1.0-gdb.py
169
+ +++ _la_log ' _la_target: /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load/replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX/lib/libarrow.so.1900.1.0-gdb.py'
170
+ +++ '[' '' = 1 ']'
171
+ +++ _la_log ' _la_symlink: /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load//home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/libarrow.so.1900.1.0-gdb.py'
172
+ +++ '[' '' = 1 ']'
173
+ +++ '[' -L /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load//home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/libarrow.so.1900.1.0-gdb.py ']'
174
+ ++++ readlink /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load//home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/libarrow.so.1900.1.0-gdb.py
175
+ +++ '[' /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load/replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX/lib/libarrow.so.1900.1.0-gdb.py = /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdb/auto-load/replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX/lib/libarrow.so.1900.1.0-gdb.py ']'
176
+ +++ _la_log 'symlink $_la_symlink already exists and points to $_la_target, skipping.'
177
+ +++ '[' '' = 1 ']'
178
+ +++ continue
179
+ +++ _la_log 'Libarrow activation complete.'
180
+ +++ '[' '' = 1 ']'
181
+ +++ unset _la_gdb_prefix
182
+ +++ unset _la_log
183
+ +++ unset _la_orig_install_dir
184
+ +++ unset _la_placeholder
185
+ +++ unset _la_symlink
186
+ +++ unset _la_symlink_dir
187
+ +++ unset _la_target
188
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libglib_activate.sh
189
+ +++ export GSETTINGS_SCHEMA_DIR_CONDA_BACKUP=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/glib-2.0/schemas
190
+ +++ GSETTINGS_SCHEMA_DIR_CONDA_BACKUP=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/glib-2.0/schemas
191
+ +++ export GSETTINGS_SCHEMA_DIR=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/glib-2.0/schemas
192
+ +++ GSETTINGS_SCHEMA_DIR=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/glib-2.0/schemas
193
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libpdal-core_activate.sh
194
+ +++ '[' -n /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/python3.12/site-packages/pdal ']'
195
+ +++ export _CONDA_SET_PDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/python3.12/site-packages/pdal
196
+ +++ _CONDA_SET_PDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/python3.12/site-packages/pdal
197
+ +++ export PDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib
198
+ +++ PDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib
199
+ +++ '[' '!' -d /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib ']'
200
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/libxml2_activate.sh
201
+ +++ test -n 'file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog'
202
+ +++ xml_catalog_files_libxml2='file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog'
203
+ +++ XML_CATALOG_FILES='file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog '
204
+ +++ conda_catalog_files=
205
+ +++ ifs_libxml2='
206
+ '
207
+ +++ IFS=' '
208
+ +++ rem=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env
209
+ +++ for pre in ${rem}
210
+ +++ test '' = /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env
211
+ +++ conda_catalog_files=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env
212
+ +++ rem=
213
+ +++ IFS='
214
+ '
215
+ +++ conda_catalog_files='file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog'
216
+ +++ export 'XML_CATALOG_FILES=file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog'
217
+ +++ XML_CATALOG_FILES='file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog'
218
+ +++ unset conda_catalog_files ifs_libxml2 rem
219
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/pdal-python-activate.sh
220
+ +++ [[ -n /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib ]]
221
+ +++ export _CONDA_SET_PDAL_PYTHON_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib
222
+ +++ _CONDA_SET_PDAL_PYTHON_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib
223
+ +++ export PDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/python3.12/site-packages/pdal
224
+ +++ PDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/python3.12/site-packages/pdal
225
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/activate.d/proj4-activate.sh
226
+ +++ '[' -n /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/proj ']'
227
+ +++ export _CONDA_SET_PROJ_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/proj
228
+ +++ _CONDA_SET_PROJ_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/proj
229
+ +++ '[' -d /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/proj ']'
230
+ +++ export PROJ_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/proj
231
+ +++ PROJ_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/proj
232
+ +++ '[' -f /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/proj/copyright_and_licenses.csv ']'
233
+ +++ export PROJ_NETWORK=ON
234
+ +++ PROJ_NETWORK=ON
235
+ + __conda_hashr
236
+ + '[' -n '' ']'
237
+ + '[' -n '' ']'
238
+ + hash -r
239
+ + python /home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/image_data_metadata.py
240
+
241
+ + conda deactivate
242
+ + local cmd=deactivate
243
+ + case "$cmd" in
244
+ + __conda_activate deactivate
245
+ + '[' -n '' ']'
246
+ + local ask_conda
247
+ ++ PS1='(aku_env) '
248
+ ++ __conda_exe shell.posix deactivate
249
+ ++ '[' -n '' ']'
250
+ ++ /home/apps/miniconda3/bin/conda shell.posix deactivate
251
+ + ask_conda='export PATH='\''/home/apps/miniconda3/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/cli/servers/Stable-ddc367ed5c8936efe395cffeec279b04ffd7db78/server/bin/remote-cli:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/apps/MLDL/DL-CondaPy3/condabin:/home/IITB/ai-at-ieor/23m1521/.local/bin:/home/IITB/ai-at-ieor/23m1521/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/var/lib/snapd/snap/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/extensions/ms-python.debugpy-2025.4.1-linux-x64/bundled/scripts/noConfigScripts:/home/IITB/ai-at-ieor/23m1521/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand'\''
252
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/proj4-deactivate.sh"
253
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/pdal-python-deactivate.sh"
254
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/libxml2_deactivate.sh"
255
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/libpdal-core_deactivate.sh"
256
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/libglib_deactivate.sh"
257
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/geotiff-deactivate.sh"
258
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/gdal-deactivate.sh"
259
+ unset CONDA_PREFIX_1
260
+ unset _CE_M
261
+ unset _CE_CONDA
262
+ PS1='\''(base) '\''
263
+ export CONDA_PREFIX='\''/home/apps/miniconda3'\''
264
+ export CONDA_SHLVL='\''1'\''
265
+ export CONDA_DEFAULT_ENV='\''base'\''
266
+ export CONDA_PROMPT_MODIFIER='\''(base) '\''
267
+ export CONDA_EXE='\''/home/apps/miniconda3/bin/conda'\''
268
+ export CONDA_PYTHON_EXE='\''/home/apps/miniconda3/bin/python'\'''
269
+ + eval 'export PATH='\''/home/apps/miniconda3/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/cli/servers/Stable-ddc367ed5c8936efe395cffeec279b04ffd7db78/server/bin/remote-cli:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/apps/MLDL/DL-CondaPy3/condabin:/home/IITB/ai-at-ieor/23m1521/.local/bin:/home/IITB/ai-at-ieor/23m1521/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/var/lib/snapd/snap/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/extensions/ms-python.debugpy-2025.4.1-linux-x64/bundled/scripts/noConfigScripts:/home/IITB/ai-at-ieor/23m1521/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand'\''
270
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/proj4-deactivate.sh"
271
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/pdal-python-deactivate.sh"
272
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/libxml2_deactivate.sh"
273
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/libpdal-core_deactivate.sh"
274
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/libglib_deactivate.sh"
275
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/geotiff-deactivate.sh"
276
+ . "/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/gdal-deactivate.sh"
277
+ unset CONDA_PREFIX_1
278
+ unset _CE_M
279
+ unset _CE_CONDA
280
+ PS1='\''(base) '\''
281
+ export CONDA_PREFIX='\''/home/apps/miniconda3'\''
282
+ export CONDA_SHLVL='\''1'\''
283
+ export CONDA_DEFAULT_ENV='\''base'\''
284
+ export CONDA_PROMPT_MODIFIER='\''(base) '\''
285
+ export CONDA_EXE='\''/home/apps/miniconda3/bin/conda'\''
286
+ export CONDA_PYTHON_EXE='\''/home/apps/miniconda3/bin/python'\'''
287
+ ++ export PATH=/home/apps/miniconda3/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/cli/servers/Stable-ddc367ed5c8936efe395cffeec279b04ffd7db78/server/bin/remote-cli:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/apps/MLDL/DL-CondaPy3/condabin:/home/IITB/ai-at-ieor/23m1521/.local/bin:/home/IITB/ai-at-ieor/23m1521/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/var/lib/snapd/snap/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/extensions/ms-python.debugpy-2025.4.1-linux-x64/bundled/scripts/noConfigScripts:/home/IITB/ai-at-ieor/23m1521/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand
288
+ ++ PATH=/home/apps/miniconda3/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/cli/servers/Stable-ddc367ed5c8936efe395cffeec279b04ffd7db78/server/bin/remote-cli:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/bin:/home/apps/MLDL/DL-CondaPy3/condabin:/home/IITB/ai-at-ieor/23m1521/.local/bin:/home/IITB/ai-at-ieor/23m1521/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/var/lib/snapd/snap/bin:/home/IITB/ai-at-ieor/23m1521/.vscode-server/extensions/ms-python.debugpy-2025.4.1-linux-x64/bundled/scripts/noConfigScripts:/home/IITB/ai-at-ieor/23m1521/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand
289
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/proj4-deactivate.sh
290
+ +++ unset PROJ_DATA
291
+ +++ unset PROJ_NETWORK
292
+ +++ '[' -n /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/proj ']'
293
+ +++ export PROJ_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/proj
294
+ +++ PROJ_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/proj
295
+ +++ unset _CONDA_SET_PROJ_DATA
296
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/pdal-python-deactivate.sh
297
+ +++ [[ -n /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib ]]
298
+ +++ export PDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib
299
+ +++ PDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib
300
+ +++ unset _CONDA_SET_PDAL_PYTHON_DRIVER_PATH
301
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/libxml2_deactivate.sh
302
+ +++ test -n 'file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog'
303
+ +++ export 'XML_CATALOG_FILES=file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog'
304
+ +++ XML_CATALOG_FILES='file:///home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/xml/catalog file:///etc/xml/catalog'
305
+ +++ unset xml_catalog_files_libxml2
306
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/libpdal-core_deactivate.sh
307
+ +++ unset PDAL_DRIVER_PATH
308
+ +++ '[' -n /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/python3.12/site-packages/pdal ']'
309
+ +++ export PDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/python3.12/site-packages/pdal
310
+ +++ PDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib:/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/python3.12/site-packages/pdal
311
+ +++ unset _CONDA_SET_PDAL_DRIVER_PATH
312
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/libglib_deactivate.sh
313
+ +++ export GSETTINGS_SCHEMA_DIR=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/glib-2.0/schemas
314
+ +++ GSETTINGS_SCHEMA_DIR=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/glib-2.0/schemas
315
+ +++ unset GSETTINGS_SCHEMA_DIR_CONDA_BACKUP
316
+ +++ '[' -z /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/glib-2.0/schemas ']'
317
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/geotiff-deactivate.sh
318
+ +++ unset GEOTIFF_CSV
319
+ +++ '[' -n '' ']'
320
+ ++ . /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/etc/conda/deactivate.d/gdal-deactivate.sh
321
+ +++ unset GDAL_DATA
322
+ +++ '[' -n /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdal ']'
323
+ +++ export GDAL_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdal
324
+ +++ GDAL_DATA=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/share/gdal
325
+ +++ unset _CONDA_SET_GDAL_DATA
326
+ +++ unset GDAL_DRIVER_PATH
327
+ +++ '[' -n /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/gdalplugins ']'
328
+ +++ export GDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/gdalplugins
329
+ +++ GDAL_DRIVER_PATH=/home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env/lib/gdalplugins
330
+ +++ unset _CONDA_SET_GDAL_DRIVER_PATH
331
+ +++ unset CPL_ZIP_ENCODING
332
+ ++ unset CONDA_PREFIX_1
333
+ ++ unset _CE_M
334
+ ++ unset _CE_CONDA
335
+ ++ PS1='(base) '
336
+ ++ export CONDA_PREFIX=/home/apps/miniconda3
337
+ ++ CONDA_PREFIX=/home/apps/miniconda3
338
+ ++ export CONDA_SHLVL=1
339
+ ++ CONDA_SHLVL=1
340
+ ++ export CONDA_DEFAULT_ENV=base
341
+ ++ CONDA_DEFAULT_ENV=base
342
+ ++ export 'CONDA_PROMPT_MODIFIER=(base) '
343
+ ++ CONDA_PROMPT_MODIFIER='(base) '
344
+ ++ export CONDA_EXE=/home/apps/miniconda3/bin/conda
345
+ ++ CONDA_EXE=/home/apps/miniconda3/bin/conda
346
+ ++ export CONDA_PYTHON_EXE=/home/apps/miniconda3/bin/python
347
+ ++ CONDA_PYTHON_EXE=/home/apps/miniconda3/bin/python
348
+ + __conda_hashr
349
+ + '[' -n '' ']'
350
+ + '[' -n '' ']'
351
+ + hash -r
Vaani/39448.out ADDED
@@ -0,0 +1,11 @@
+
+ # conda environments:
+ #
+ aku_env /home/IITB/ai-at-ieor/23m1521/.conda/envs/aku_env
+ cuml /home/IITB/ai-at-ieor/23m1521/.conda/envs/cuml
+ base * /home/apps/miniconda3
+ SCA_deepspeed /home/apps/miniconda3/envs/SCA_deepspeed
+ llama2 /home/apps/miniconda3/envs/llama2
+ tutorial /home/apps/miniconda3/envs/tutorial
+
+ Results saved to /home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/image_dimensions_count.csv
Vaani/IISc_VaaniProject_M_AP_Anantpur_00014520_1544240000_APATSR_190315_1880_16300.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:903b573851ab7767554050c6b238964660511571352084706283a2db802ffb35
+ size 462726
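The three lines above are the Git LFS pointer stored in place of the audio itself: a spec version, the SHA-256 of the real file, and its size in bytes. A quick way to check a downloaded copy against the pointer (local file path assumed, not part of the repository) is:

import hashlib
from pathlib import Path

# Hypothetical local copy of the LFS-tracked audio file.
path = Path("IISc_VaaniProject_M_AP_Anantpur_00014520_1544240000_APATSR_190315_1880_16300.wav")
digest = hashlib.sha256(path.read_bytes()).hexdigest()
print(digest)               # should equal the oid recorded in the pointer
print(path.stat().st_size)  # should equal the recorded size (462726 bytes)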
Vaani/LDM/__init__.py ADDED
File without changes
Vaani/LDM/notebooks/Vaani-subplot.png ADDED

Git LFS Details

  • SHA256: 3b22fe2de54d1a38e517def2bd26d83b5eb1279237f02b7652cc4480530492a5
  • Pointer size: 132 Bytes
  • Size of remote file: 8.94 MB
Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-15_16.png ADDED

Git LFS Details

  • SHA256: 7a8b412940e4342bd00636ca00b292e82530e39ae1f3dfb8d8993004a7ba9973
  • Pointer size: 131 Bytes
  • Size of remote file: 959 kB
Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-30_16.png ADDED

Git LFS Details

  • SHA256: 98cafe70cfb371e0ed61091c479c920ff6de0b8d29c46c7190a1c254244828e3
  • Pointer size: 131 Bytes
  • Size of remote file: 968 kB
Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-4.png ADDED

Git LFS Details

  • SHA256: 4ba9a1004033edbe03c1a0d9e1672fc0f4f966968f08ac6e535db52531f0ec14
  • Pointer size: 131 Bytes
  • Size of remote file: 491 kB
Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-5.png ADDED

Git LFS Details

  • SHA256: ef43308df008221144827461f26bacca11f6c9a0d17970129f076b1a95ab630e
  • Pointer size: 131 Bytes
  • Size of remote file: 488 kB
Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-6.png ADDED

Git LFS Details

  • SHA256: 4d909e9e6de6cec642bf668f451e38ae021854250ae86353eaa030afbc95529b
  • Pointer size: 131 Bytes
  • Size of remote file: 491 kB
Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-6_16.png ADDED

Git LFS Details

  • SHA256: 6b16b33674cc9d2c6618828519b07de8553835c825bf248131efc054a67d277d
  • Pointer size: 131 Bytes
  • Size of remote file: 967 kB
Vaani/LDM/notebooks/Vaani_VQVAE_Recon_Images/reconstructed_images_EP-8_16.png ADDED

Git LFS Details

  • SHA256: da223574afe8fe5860b17e229f23b94b04d7acbd7f9409270c7f301f5a72bcac
  • Pointer size: 131 Bytes
  • Size of remote file: 971 kB
Vaani/LDM/notebooks/_1_Main.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Vaani/LDM/notebooks/_2_Rough-LPIPS.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Vaani/LDM/scripts/AE-training.log ADDED
@@ -0,0 +1,126 @@
0
  0%| | 0/3 [00:00<?, ?it/s]
 
1
  0%| | 0/32201 [00:00<?, ?it/s]
 
2
  0%| | 1/32201 [00:04<42:33:43, 4.76s/it]
 
3
  0%| | 2/32201 [00:05<19:42:29, 2.20s/it]
 
4
  0%| | 3/32201 [00:05<13:55:59, 1.56s/it]
 
5
  0%| | 4/32201 [00:06<11:09:27, 1.25s/it]
 
6
  0%| | 5/32201 [00:07<9:39:35, 1.08s/it] 
 
7
  0%| | 6/32201 [00:08<8:45:29, 1.02it/s]
 
8
  0%| | 7/32201 [00:09<8:11:20, 1.09it/s]
 
9
  0%| | 8/32201 [00:09<7:48:52, 1.14it/s]
 
10
  0%| | 9/32201 [00:10<7:33:28, 1.18it/s]
 
11
  0%| | 10/32201 [00:11<7:22:44, 1.21it/s]
 
12
  0%| | 11/32201 [00:12<7:15:53, 1.23it/s]
 
13
  0%| | 12/32201 [00:13<7:11:41, 1.24it/s]
 
14
  0%| | 13/32201 [00:13<7:08:13, 1.25it/s]
 
15
  0%| | 14/32201 [00:14<7:05:54, 1.26it/s]
 
16
  0%| | 15/32201 [00:15<7:05:03, 1.26it/s]
 
17
  0%| | 16/32201 [00:16<7:02:58, 1.27it/s]
 
18
  0%| | 17/32201 [00:16<7:01:54, 1.27it/s]
 
19
  0%| | 18/32201 [00:17<7:01:32, 1.27it/s]
 
20
  0%| | 19/32201 [00:18<7:01:28, 1.27it/s]
 
21
  0%| | 20/32201 [00:19<7:00:58, 1.27it/s]
 
22
  0%| | 21/32201 [00:20<7:00:49, 1.27it/s]
 
23
  0%| | 22/32201 [00:20<7:00:22, 1.28it/s]
 
24
  0%| | 23/32201 [00:21<7:00:16, 1.28it/s]
 
25
  0%| | 24/32201 [00:22<7:00:04, 1.28it/s]
 
26
  0%| | 25/32201 [00:23<7:00:03, 1.28it/s]
 
27
  0%| | 26/32201 [00:23<7:00:35, 1.27it/s]
 
28
  0%| | 27/32201 [00:24<7:00:58, 1.27it/s]
 
29
  0%| | 28/32201 [00:25<7:01:00, 1.27it/s]
 
30
  0%| | 29/32201 [00:26<7:01:05, 1.27it/s]
 
31
  0%| | 30/32201 [00:27<7:00:31, 1.28it/s]
 
32
  0%| | 31/32201 [00:27<7:00:28, 1.28it/s]
 
33
  0%| | 32/32201 [00:28<7:00:15, 1.28it/s]
 
34
  0%| | 33/32201 [00:29<7:00:04, 1.28it/s]
 
35
  0%| | 34/32201 [00:30<7:00:28, 1.28it/s]
 
36
  0%| | 35/32201 [00:31<7:00:09, 1.28it/s]
 
37
  0%| | 36/32201 [00:31<6:59:58, 1.28it/s]
 
38
  0%| | 37/32201 [00:32<7:00:04, 1.28it/s]
 
39
  0%| | 38/32201 [00:33<7:00:27, 1.27it/s]
 
40
  0%| | 39/32201 [00:34<7:00:33, 1.27it/s]
 
41
  0%| | 40/32201 [00:34<7:00:21, 1.28it/s]
 
42
  0%| | 41/32201 [00:35<7:00:25, 1.27it/s]
 
43
  0%| | 42/32201 [00:36<7:00:46, 1.27it/s]
 
44
  0%| | 43/32201 [00:37<7:00:35, 1.27it/s]
 
45
  0%| | 44/32201 [00:38<7:00:56, 1.27it/s]
 
46
  0%| | 45/32201 [00:38<7:00:58, 1.27it/s]
 
47
  0%| | 46/32201 [00:39<7:01:10, 1.27it/s]
 
48
  0%| | 47/32201 [00:40<7:01:06, 1.27it/s]
 
49
  0%| | 48/32201 [00:41<7:00:54, 1.27it/s]
 
50
  0%| | 49/32201 [00:42<7:01:07, 1.27it/s]
 
51
  0%| | 50/32201 [00:42<7:01:11, 1.27it/s]
 
52
  0%| | 51/32201 [00:43<7:01:03, 1.27it/s]
 
53
  0%| | 52/32201 [00:44<7:00:58, 1.27it/s]
 
54
  0%| | 53/32201 [00:45<7:01:05, 1.27it/s]
 
55
  0%| | 54/32201 [00:45<7:00:55, 1.27it/s]
 
56
  0%| | 55/32201 [00:46<7:01:12, 1.27it/s]
 
57
  0%| | 56/32201 [00:47<7:01:30, 1.27it/s]
 
58
  0%| | 57/32201 [00:48<7:01:27, 1.27it/s]
 
59
  0%| | 58/32201 [00:49<7:01:30, 1.27it/s]
 
60
  0%| | 59/32201 [00:49<7:02:07, 1.27it/s]
61
  0%| | 59/32201 [00:50<7:36:56, 1.17it/s]
 
62
  0%| | 0/3 [00:50<?, ?it/s]
 
 
 
 
 
 
 
 
1
+ TIME: 2025-03-25 01:30:39.070253
2
+ DEVICE: cuda
3
+ {'autoencoder_params': {'attn_down': [False, False],
4
+ 'codebook_size': 20,
5
+ 'down_channels': [32, 64, 128],
6
+ 'down_sample': [True, True],
7
+ 'mid_channels': [128, 128],
8
+ 'norm_channels': 32,
9
+ 'num_down_layers': 4,
10
+ 'num_heads': 16,
11
+ 'num_mid_layers': 4,
12
+ 'num_up_layers': 4,
13
+ 'z_channels': 3},
14
+ 'dataset_params': {'im_channels': 3,
15
+ 'im_path': '/home/taruntejaneurips23/Ashish/datasets/CelebA/img_align_celeba/img_align_celeba',
16
+ 'im_size': 256},
17
+ 'diffusion_params': {'beta_end': 0.0195, 'beta_start': 0.0015, 'num_timesteps': 1000},
18
+ 'ldm_params': {'attn_down': [True, True, True],
19
+ 'conv_out_channels': 128,
20
+ 'down_channels': [128, 256, 256, 256],
21
+ 'down_sample': [False, False, False],
22
+ 'mid_channels': [256, 256],
23
+ 'norm_channels': 32,
24
+ 'num_down_layers': 2,
25
+ 'num_heads': 16,
26
+ 'num_mid_layers': 2,
27
+ 'num_up_layers': 2,
28
+ 'time_emb_dim': 256},
29
+ 'train_params': {'autoencoder_acc_steps': 1,
30
+ 'autoencoder_batch_size': 4,
31
+ 'autoencoder_epochs': 3,
32
+ 'autoencoder_img_save_steps': 8,
33
+ 'autoencoder_lr': 0.0001,
34
+ 'checkpoint_dir': './',
35
+ 'codebook_weight': 1,
36
+ 'commitment_beta': 0.2,
37
+ 'disc_start': 1000,
38
+ 'disc_weight': 0.5,
39
+ 'kl_weight': 5e-06,
40
+ 'ldm_batch_size': 1,
41
+ 'ldm_ckpt_name': 'ddpm_ckpt.pth',
42
+ 'ldm_epochs': 10,
43
+ 'ldm_lr': 1e-05,
44
+ 'num_grid_rows': 3,
45
+ 'num_samples': 9,
46
+ 'perceptual_weight': 1,
47
+ 'save_latents': True,
48
+ 'seed': 4422,
49
+ 'task_name': 'VaaniLDM',
50
+ 'vqvae_autoencoder_ckpt_name': 'vqvae_autoencoder_ckpt.pth',
51
+ 'vqvae_discriminator_ckpt_name': 'vqvae_discriminator_ckpt.pth',
52
+ 'vqvae_latent_dir_name': 'vqvae_latents'},
53
+ 'training': {'_continue_': True}}
54
+ Files found: 128807
55
+ IMAGE SHAPE: torch.Size([3, 256, 256])
56
+ BATCH SHAPE: torch.Size([4, 3, 256, 256])
57
+ No checkpoint found. Starting from scratch.
58
+
59
  0%| | 0/3 [00:00<?, ?it/s]
60
+
61
  0%| | 0/32201 [00:00<?, ?it/s]
62
+
63
  0%| | 1/32201 [00:04<42:33:43, 4.76s/it]
64
+
65
  0%| | 2/32201 [00:05<19:42:29, 2.20s/it]
66
+
67
  0%| | 3/32201 [00:05<13:55:59, 1.56s/it]
68
+
69
  0%| | 4/32201 [00:06<11:09:27, 1.25s/it]
70
+
71
  0%| | 5/32201 [00:07<9:39:35, 1.08s/it] 
72
+
73
  0%| | 6/32201 [00:08<8:45:29, 1.02it/s]
74
+
75
  0%| | 7/32201 [00:09<8:11:20, 1.09it/s]
76
+
77
  0%| | 8/32201 [00:09<7:48:52, 1.14it/s]
78
+
79
  0%| | 9/32201 [00:10<7:33:28, 1.18it/s]
80
+
81
  0%| | 10/32201 [00:11<7:22:44, 1.21it/s]
82
+
83
  0%| | 11/32201 [00:12<7:15:53, 1.23it/s]
84
+
85
  0%| | 12/32201 [00:13<7:11:41, 1.24it/s]
86
+
87
  0%| | 13/32201 [00:13<7:08:13, 1.25it/s]
88
+
89
  0%| | 14/32201 [00:14<7:05:54, 1.26it/s]
90
+
91
  0%| | 15/32201 [00:15<7:05:03, 1.26it/s]
92
+
93
  0%| | 16/32201 [00:16<7:02:58, 1.27it/s]
94
+
95
  0%| | 17/32201 [00:16<7:01:54, 1.27it/s]
96
+
97
  0%| | 18/32201 [00:17<7:01:32, 1.27it/s]
98
+
99
  0%| | 19/32201 [00:18<7:01:28, 1.27it/s]
100
+
101
  0%| | 20/32201 [00:19<7:00:58, 1.27it/s]
102
+
103
  0%| | 21/32201 [00:20<7:00:49, 1.27it/s]
104
+
105
  0%| | 22/32201 [00:20<7:00:22, 1.28it/s]
106
+
107
  0%| | 23/32201 [00:21<7:00:16, 1.28it/s]
108
+
109
  0%| | 24/32201 [00:22<7:00:04, 1.28it/s]
110
+
111
  0%| | 25/32201 [00:23<7:00:03, 1.28it/s]
112
+
113
  0%| | 26/32201 [00:23<7:00:35, 1.27it/s]
114
+
115
  0%| | 27/32201 [00:24<7:00:58, 1.27it/s]
116
+
117
  0%| | 28/32201 [00:25<7:01:00, 1.27it/s]
118
+
119
  0%| | 29/32201 [00:26<7:01:05, 1.27it/s]
120
+
121
  0%| | 30/32201 [00:27<7:00:31, 1.28it/s]
122
+
123
  0%| | 31/32201 [00:27<7:00:28, 1.28it/s]
124
+
125
  0%| | 32/32201 [00:28<7:00:15, 1.28it/s]
126
+
127
  0%| | 33/32201 [00:29<7:00:04, 1.28it/s]
128
+
129
  0%| | 34/32201 [00:30<7:00:28, 1.28it/s]
130
+
131
  0%| | 35/32201 [00:31<7:00:09, 1.28it/s]
132
+
133
  0%| | 36/32201 [00:31<6:59:58, 1.28it/s]
134
+
135
  0%| | 37/32201 [00:32<7:00:04, 1.28it/s]
136
+
137
  0%| | 38/32201 [00:33<7:00:27, 1.27it/s]
138
+
139
  0%| | 39/32201 [00:34<7:00:33, 1.27it/s]
140
+
141
  0%| | 40/32201 [00:34<7:00:21, 1.28it/s]
142
+
143
  0%| | 41/32201 [00:35<7:00:25, 1.27it/s]
144
+
145
  0%| | 42/32201 [00:36<7:00:46, 1.27it/s]
146
+
147
  0%| | 43/32201 [00:37<7:00:35, 1.27it/s]
148
+
149
  0%| | 44/32201 [00:38<7:00:56, 1.27it/s]
150
+
151
  0%| | 45/32201 [00:38<7:00:58, 1.27it/s]
152
+
153
  0%| | 46/32201 [00:39<7:01:10, 1.27it/s]
154
+
155
  0%| | 47/32201 [00:40<7:01:06, 1.27it/s]
156
+
157
  0%| | 48/32201 [00:41<7:00:54, 1.27it/s]
158
+
159
  0%| | 49/32201 [00:42<7:01:07, 1.27it/s]
160
+
161
  0%| | 50/32201 [00:42<7:01:11, 1.27it/s]
162
+
163
  0%| | 51/32201 [00:43<7:01:03, 1.27it/s]
164
+
165
  0%| | 52/32201 [00:44<7:00:58, 1.27it/s]
166
+
167
  0%| | 53/32201 [00:45<7:01:05, 1.27it/s]
168
+
169
  0%| | 54/32201 [00:45<7:00:55, 1.27it/s]
170
+
171
  0%| | 55/32201 [00:46<7:01:12, 1.27it/s]
172
+
173
  0%| | 56/32201 [00:47<7:01:30, 1.27it/s]
174
+
175
  0%| | 57/32201 [00:48<7:01:27, 1.27it/s]
176
+
177
  0%| | 58/32201 [00:49<7:01:30, 1.27it/s]
178
+
179
  0%| | 59/32201 [00:49<7:02:07, 1.27it/s]
180
  0%| | 59/32201 [00:50<7:36:56, 1.17it/s]
181
+
182
  0%| | 0/3 [00:50<?, ?it/s]
183
+ Traceback (most recent call last):
184
+ File "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/LDM/scripts/Vaani-VQVAE-Main.py", line 1105, in <module>
185
+ trainVAE(Config, dataloader)
186
+ File "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/LDM/scripts/Vaani-VQVAE-Main.py", line 1049, in trainVAE
187
+ images = images.to(device)
188
+ ^^^^^^^^^^^^^^^^^
189
+ KeyboardInterrupt
Vaani/LDM/scripts/Main.py ADDED
@@ -0,0 +1,2303 @@
1
+ # ==================================================================
2
+ # L A T E N T D I F F U S I O N M O D E L
3
+ # ==================================================================
4
+ # Author : Ashish Kumar Uchadiya
5
+ # Created : November 3, 2024
6
+ # Description: This script implements a Latent Diffusion Model using
7
+ # a cosine or linear noise scheduling approach for high-resolution
8
+ # image generation. The model leverages generative techniques to
9
+ # learn a latent representation and progressively reduce noise to
10
+ # generate clear, realistic images.
11
+ # ==================================================================
12
+ # I M P O R T S
13
+ # ==================================================================
14
+
15
+ import os
16
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
17
+
18
+ """Lpips"""
19
+
20
+ # from __future__ import absolute_import
21
+ from collections import namedtuple
22
+ import torch
23
+ import torch.nn as nn
24
+ import torch.nn.init as init
25
+ from torch.autograd import Variable
26
+ import numpy as np
27
+ import torch.nn
28
+ import torchvision
29
+
30
+ # Taken from https://github.com/richzhang/PerceptualSimilarity/blob/master/lpips/lpips.py
31
+
32
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
33
+
34
+
35
+ def spatial_average(in_tens, keepdim=True):
36
+ return in_tens.mean([2, 3], keepdim=keepdim)
37
+
38
+
39
+ class vgg16(torch.nn.Module):
40
+ def __init__(self, requires_grad=False, pretrained=True):
41
+ super(vgg16, self).__init__()
42
+ vgg_pretrained_features = torchvision.models.vgg16(
43
+ weights=torchvision.models.VGG16_Weights.IMAGENET1K_V1
44
+ ).features
45
+ self.slice1 = torch.nn.Sequential()
46
+ self.slice2 = torch.nn.Sequential()
47
+ self.slice3 = torch.nn.Sequential()
48
+ self.slice4 = torch.nn.Sequential()
49
+ self.slice5 = torch.nn.Sequential()
50
+ self.N_slices = 5
51
+ for x in range(4):
52
+ self.slice1.add_module(str(x), vgg_pretrained_features[x])
53
+ for x in range(4, 9):
54
+ self.slice2.add_module(str(x), vgg_pretrained_features[x])
55
+ for x in range(9, 16):
56
+ self.slice3.add_module(str(x), vgg_pretrained_features[x])
57
+ for x in range(16, 23):
58
+ self.slice4.add_module(str(x), vgg_pretrained_features[x])
59
+ for x in range(23, 30):
60
+ self.slice5.add_module(str(x), vgg_pretrained_features[x])
61
+
62
+ # Freeze vgg model
63
+ if not requires_grad:
64
+ for param in self.parameters():
65
+ param.requires_grad = False
66
+
67
+ def forward(self, X):
68
+ # Return output of vgg features
69
+ h = self.slice1(X)
70
+ h_relu1_2 = h
71
+ h = self.slice2(h)
72
+ h_relu2_2 = h
73
+ h = self.slice3(h)
74
+ h_relu3_3 = h
75
+ h = self.slice4(h)
76
+ h_relu4_3 = h
77
+ h = self.slice5(h)
78
+ h_relu5_3 = h
79
+ vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
80
+ out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
81
+ return out
82
+
83
+
84
+ # Learned perceptual metric
85
+ class LPIPS(nn.Module):
86
+ def __init__(self, net='vgg', version='0.1', use_dropout=True):
87
+ super(LPIPS, self).__init__()
88
+ self.version = version
89
+ # Imagenet normalization
90
+ self.scaling_layer = ScalingLayer()
91
+ ########################
92
+
93
+ # Instantiate vgg model
94
+ self.chns = [64, 128, 256, 512, 512]
95
+ self.L = len(self.chns)
96
+ self.net = vgg16(pretrained=True, requires_grad=False)
97
+
98
+ # Add 1x1 convolutional Layers
99
+ self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
100
+ self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
101
+ self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
102
+ self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
103
+ self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
104
+ self.lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
105
+ self.lins = nn.ModuleList(self.lins)
106
+ ########################
107
+
108
+ # Load the weights of trained LPIPS model
109
+ import inspect
110
+ import os
111
+ # /home/taruntejaneurips23/.cache/torch/hub/checkpoints/vgg16-397923af.pth
112
+ print(os.path.abspath(os.path.join(inspect.getfile(self.__init__), '..', 'weights/v%s/%s.pth' % (version, net))))
113
+ # model_path = os.path.abspath(
114
+ # os.path.join(inspect.getfile(self.__init__), '..', 'weights/v%s/%s.pth' % (version, net)))
115
+
116
+ # print('Loading model from: %s' % model_path)
117
+ # self.load_state_dict(torch.load(model_path, map_location=device), strict=False)
118
+ ########################
119
+
120
+ # Freeze all parameters
121
+ self.eval()
122
+ for param in self.parameters():
123
+ param.requires_grad = False
124
+ ########################
125
+
126
+ def forward(self, in0, in1, normalize=False):
127
+ # Scale the inputs to -1 to +1 range if needed
128
+ if normalize: # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
129
+ in0 = 2 * in0 - 1
130
+ in1 = 2 * in1 - 1
131
+ ########################
132
+
133
+ # Normalize the inputs according to imagenet normalization
134
+ in0_input, in1_input = self.scaling_layer(in0), self.scaling_layer(in1)
135
+ ########################
136
+
137
+ # Get VGG outputs for image0 and image1
138
+ outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
139
+ feats0, feats1, diffs = {}, {}, {}
140
+ ########################
141
+
142
+ # Compute Square of Difference for each layer output
143
+ for kk in range(self.L):
144
+ feats0[kk], feats1[kk] = torch.nn.functional.normalize(outs0[kk], dim=1), torch.nn.functional.normalize(
145
+ outs1[kk])
146
+ diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
147
+ ########################
148
+
149
+ # 1x1 convolution followed by spatial average on the square differences
150
+ res = [spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)]
151
+ val = 0
152
+
153
+ # Aggregate the results of each layer
154
+ for l in range(self.L):
155
+ val += res[l]
156
+ return val
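Written out, the forward pass above is the LPIPS distance of Zhang et al. (2018): with channel-wise unit-normalized activations \(\hat{y}^{l}\) and \(\hat{y}_{0}^{l}\) from layer \(l\) and the learned 1x1-conv weights \(w_l\),

$$ d(x, x_0) = \sum_{l} \frac{1}{H_l W_l} \sum_{h,w} \left\lVert w_l \odot \left( \hat{y}^{l}_{hw} - \hat{y}^{l}_{0,hw} \right) \right\rVert_2^2 $$

where the spatial mean corresponds to `spatial_average` and the sum over \(l\) to the final aggregation loop.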
157
+
158
+
159
+ class ScalingLayer(nn.Module):
160
+ def __init__(self):
161
+ super(ScalingLayer, self).__init__()
162
+ # Imagnet normalization for (0-1)
163
+ # mean = [0.485, 0.456, 0.406]
164
+ # std = [0.229, 0.224, 0.225]
165
+ self.register_buffer('shift', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
166
+ self.register_buffer('scale', torch.Tensor([.458, .448, .450])[None, :, None, None])
167
+
168
+ def forward(self, inp):
169
+ return (inp - self.shift) / self.scale
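The shift/scale constants above are just the ImageNet statistics re-expressed for inputs in \([-1, 1]\): with mean \(\mu = (0.485, 0.456, 0.406)\) and std \(\sigma = (0.229, 0.224, 0.225)\) defined for \([0, 1]\) images,

$$ \frac{\tfrac{x+1}{2} - \mu}{\sigma} = \frac{x - (2\mu - 1)}{2\sigma}, \qquad 2\mu - 1 = (-0.030, -0.088, -0.188), \quad 2\sigma = (0.458, 0.448, 0.450), $$

which is exactly the registered `shift` and `scale` buffers.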
170
+
171
+
172
+ class NetLinLayer(nn.Module):
173
+ ''' A single linear layer which does a 1x1 conv '''
174
+
175
+ def __init__(self, chn_in, chn_out=1, use_dropout=False):
176
+ super(NetLinLayer, self).__init__()
177
+
178
+ layers = [nn.Dropout(), ] if (use_dropout) else []
179
+ layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ]
180
+ self.model = nn.Sequential(*layers)
181
+
182
+ def forward(self, x):
183
+ out = self.model(x)
184
+ return out
185
+
186
+ """Blocks"""
187
+
188
+ import torch
189
+ import numpy as np
190
+
191
+
192
+ class LinearNoiseScheduler:
193
+ r"""
194
+ Class for the linear noise scheduler that is used in DDPM.
195
+ """
196
+
197
+ def __init__(self, num_timesteps, beta_start, beta_end):
198
+
199
+ self.num_timesteps = num_timesteps
200
+ self.beta_start = beta_start
201
+ self.beta_end = beta_end
202
+ # Mimicking how compvis repo creates schedule
203
+ self.betas = (
204
+ torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_timesteps) ** 2
205
+ )
206
+ self.alphas = 1. - self.betas
207
+ self.alpha_cum_prod = torch.cumprod(self.alphas, dim=0)
208
+ self.sqrt_alpha_cum_prod = torch.sqrt(self.alpha_cum_prod)
209
+ self.sqrt_one_minus_alpha_cum_prod = torch.sqrt(1 - self.alpha_cum_prod)
210
+
211
+ def add_noise(self, original, noise, t):
212
+ r"""
213
+ Forward method for diffusion
214
+ :param original: Image on which noise is to be applied
215
+ :param noise: Random Noise Tensor (from normal dist)
216
+ :param t: timestep of the forward process of shape -> (B,)
217
+ :return:
218
+ """
219
+ original_shape = original.shape
220
+ batch_size = original_shape[0]
221
+
222
+ sqrt_alpha_cum_prod = self.sqrt_alpha_cum_prod.to(original.device)[t].reshape(batch_size)
223
+ sqrt_one_minus_alpha_cum_prod = self.sqrt_one_minus_alpha_cum_prod.to(original.device)[t].reshape(batch_size)
224
+
225
+ # Reshape till (B,) becomes (B,1,1,1) if image is (B,C,H,W)
226
+ for _ in range(len(original_shape) - 1):
227
+ sqrt_alpha_cum_prod = sqrt_alpha_cum_prod.unsqueeze(-1)
228
+ for _ in range(len(original_shape) - 1):
229
+ sqrt_one_minus_alpha_cum_prod = sqrt_one_minus_alpha_cum_prod.unsqueeze(-1)
230
+
231
+ # Apply and Return Forward process equation
232
+ return (sqrt_alpha_cum_prod.to(original.device) * original
233
+ + sqrt_one_minus_alpha_cum_prod.to(original.device) * noise)
234
+
235
+ def sample_prev_timestep(self, xt, noise_pred, t):
236
+ r"""
237
+ Use the noise prediction by model to get
238
+ xt-1 using xt and the noise predicted
239
+ :param xt: current timestep sample
240
+ :param noise_pred: model noise prediction
241
+ :param t: current timestep we are at
242
+ :return:
243
+ """
244
+ x0 = ((xt - (self.sqrt_one_minus_alpha_cum_prod.to(xt.device)[t] * noise_pred)) /
245
+ torch.sqrt(self.alpha_cum_prod.to(xt.device)[t]))
246
+ x0 = torch.clamp(x0, -1., 1.)
247
+
248
+ mean = xt - ((self.betas.to(xt.device)[t]) * noise_pred) / (self.sqrt_one_minus_alpha_cum_prod.to(xt.device)[t])
249
+ mean = mean / torch.sqrt(self.alphas.to(xt.device)[t])
250
+
251
+ if t == 0:
252
+ return mean, x0
253
+ else:
254
+ variance = (1 - self.alpha_cum_prod.to(xt.device)[t - 1]) / (1.0 - self.alpha_cum_prod.to(xt.device)[t])
255
+ variance = variance * self.betas.to(xt.device)[t]
256
+ sigma = variance ** 0.5
257
+ z = torch.randn(xt.shape).to(xt.device)
258
+
259
+ # OR
260
+ # variance = self.betas[t]
261
+ # sigma = variance ** 0.5
262
+ # z = torch.randn(xt.shape).to(xt.device)
263
+ return mean + sigma * z, x0
264
+
265
+
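+ # Quick sanity-check sketch of the forward process
+ # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps
+ # (illustrative only; the beta values below are arbitrary examples, not the training config):
+ # scheduler = LinearNoiseScheduler(num_timesteps=1000, beta_start=0.0015, beta_end=0.0195)
+ # x0 = torch.randn(4, 3, 32, 32)
+ # eps = torch.randn_like(x0)
+ # t = torch.randint(0, 1000, (4,))
+ # xt = scheduler.add_noise(x0, eps, t)           # same shape as x0
+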
266
+ import torch
267
+ import math
268
+
269
+ class CosineNoiseScheduler:
270
+ r"""
271
+ Class for the cosine noise scheduler, often used in DDPM-based models.
272
+ """
273
+
274
+ def __init__(self, num_timesteps, s=0.008):
275
+ self.num_timesteps = num_timesteps
276
+ self.s = s
277
+
278
+ # Cosine schedule based on paper
279
+ def cosine_schedule(t):
280
+ return math.cos((t / self.num_timesteps + s) / (1 + s) * math.pi / 2) ** 2
281
+
282
+ # Compute alphas
283
+ self.alphas = torch.tensor([cosine_schedule(t) for t in range(num_timesteps)])
284
+ self.alpha_cum_prod = torch.cumprod(self.alphas, dim=0)
285
+ self.sqrt_alpha_cum_prod = torch.sqrt(self.alpha_cum_prod)
286
+ self.sqrt_one_minus_alpha_cum_prod = torch.sqrt(1 - self.alpha_cum_prod)
287
+
288
+ def add_noise(self, original, noise, t):
289
+ original_shape = original.shape
290
+ batch_size = original_shape[0]
291
+
292
+ sqrt_alpha_cum_prod = self.sqrt_alpha_cum_prod.to(original.device)[t].reshape(batch_size)
293
+ sqrt_one_minus_alpha_cum_prod = self.sqrt_one_minus_alpha_cum_prod.to(original.device)[t].reshape(batch_size)
294
+
295
+ for _ in range(len(original_shape) - 1):
296
+ sqrt_alpha_cum_prod = sqrt_alpha_cum_prod.unsqueeze(-1)
297
+ for _ in range(len(original_shape) - 1):
298
+ sqrt_one_minus_alpha_cum_prod = sqrt_one_minus_alpha_cum_prod.unsqueeze(-1)
299
+
300
+ return (sqrt_alpha_cum_prod * original + sqrt_one_minus_alpha_cum_prod * noise)
301
+
302
+ def sample_prev_timestep(self, xt, noise_pred, t):
303
+ x0 = ((xt - (self.sqrt_one_minus_alpha_cum_prod.to(xt.device)[t] * noise_pred)) /
304
+ torch.sqrt(self.alpha_cum_prod.to(xt.device)[t]))
305
+ x0 = torch.clamp(x0, -1., 1.)
306
+
307
+ mean = xt - ((1 - self.alphas.to(xt.device)[t]) * noise_pred) / (self.sqrt_one_minus_alpha_cum_prod.to(xt.device)[t])
308
+ mean = mean / torch.sqrt(self.alphas.to(xt.device)[t])
309
+
310
+ if t == 0:
311
+ return mean, x0
312
+ else:
313
+ variance = (1 - self.alpha_cum_prod.to(xt.device)[t - 1]) / (1.0 - self.alpha_cum_prod.to(xt.device)[t])
314
+ variance = variance * (1 - self.alphas.to(xt.device)[t])
315
+ sigma = variance ** 0.5
316
+ z = torch.randn(xt.shape).to(xt.device)
317
+ return mean + sigma * z, x0
318
+
319
+
320
+
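+ # Illustrative comparison of the two schedulers (not used by the training code): both expose
+ # sqrt_alpha_cum_prod, which starts near 1 and decays towards 0 as t grows; printing a few
+ # entries from each lets you compare how quickly the signal is destroyed.
+ # lin = LinearNoiseScheduler(num_timesteps=1000, beta_start=1e-4, beta_end=0.02)
+ # cos = CosineNoiseScheduler(num_timesteps=1000)
+ # print(lin.sqrt_alpha_cum_prod[::250])
+ # print(cos.sqrt_alpha_cum_prod[::250])
+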
321
+
322
+ import torch
323
+ import torch.nn as nn
324
+
325
+
326
+ def get_time_embedding(time_steps, temb_dim):
327
+ r"""
328
+ Convert time steps tensor into an embedding using the
329
+ sinusoidal time embedding formula
330
+ :param time_steps: 1D tensor of length batch size
331
+ :param temb_dim: Dimension of the embedding
332
+ :return: BxD embedding representation of B time steps
333
+ """
334
+ assert temb_dim % 2 == 0, "time embedding dimension must be divisible by 2"
335
+
336
+ # factor = 10000^(2i/d_model)
337
+ factor = 10000 ** ((torch.arange(
338
+ start=0, end=temb_dim // 2, dtype=torch.float32, device=time_steps.device) / (temb_dim // 2))
339
+ )
340
+
341
+ # pos / factor
342
+ # timesteps B -> B, 1 -> B, temb_dim
343
+ t_emb = time_steps[:, None].repeat(1, temb_dim // 2) / factor
344
+ t_emb = torch.cat([torch.sin(t_emb), torch.cos(t_emb)], dim=-1)
345
+ return t_emb
346
+
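+ # Shape-check sketch (illustrative): a batch of B integer timesteps becomes a
+ # (B, temb_dim) embedding, first half sin terms and second half cos terms.
+ # t = torch.tensor([0, 10, 500])
+ # emb = get_time_embedding(t, temb_dim=128)
+ # print(emb.shape)                               # torch.Size([3, 128])
+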
347
+
348
+ class DownBlock(nn.Module):
349
+ r"""
350
+ Down conv block with attention.
351
+ Sequence of the following blocks:
352
+ 1. Resnet block with time embedding
353
+ 2. Attention block
354
+ 3. Downsample
355
+ """
356
+
357
+ def __init__(self, in_channels, out_channels, t_emb_dim,
358
+ down_sample, num_heads, num_layers, attn, norm_channels, cross_attn=False, context_dim=None):
359
+ super().__init__()
360
+ self.num_layers = num_layers
361
+ self.down_sample = down_sample
362
+ self.attn = attn
363
+ self.context_dim = context_dim
364
+ self.cross_attn = cross_attn
365
+ self.t_emb_dim = t_emb_dim
366
+ self.resnet_conv_first = nn.ModuleList(
367
+ [
368
+ nn.Sequential(
369
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
370
+ nn.SiLU(),
371
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels,
372
+ kernel_size=3, stride=1, padding=1),
373
+ )
374
+ for i in range(num_layers)
375
+ ]
376
+ )
377
+ if self.t_emb_dim is not None:
378
+ self.t_emb_layers = nn.ModuleList([
379
+ nn.Sequential(
380
+ nn.SiLU(),
381
+ nn.Linear(self.t_emb_dim, out_channels)
382
+ )
383
+ for _ in range(num_layers)
384
+ ])
385
+ self.resnet_conv_second = nn.ModuleList(
386
+ [
387
+ nn.Sequential(
388
+ nn.GroupNorm(norm_channels, out_channels),
389
+ nn.SiLU(),
390
+ nn.Conv2d(out_channels, out_channels,
391
+ kernel_size=3, stride=1, padding=1),
392
+ )
393
+ for _ in range(num_layers)
394
+ ]
395
+ )
396
+
397
+ if self.attn:
398
+ self.attention_norms = nn.ModuleList(
399
+ [nn.GroupNorm(norm_channels, out_channels)
400
+ for _ in range(num_layers)]
401
+ )
402
+
403
+ self.attentions = nn.ModuleList(
404
+ [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
405
+ for _ in range(num_layers)]
406
+ )
407
+
408
+ if self.cross_attn:
409
+ assert context_dim is not None, "Context Dimension must be passed for cross attention"
410
+ self.cross_attention_norms = nn.ModuleList(
411
+ [nn.GroupNorm(norm_channels, out_channels)
412
+ for _ in range(num_layers)]
413
+ )
414
+ self.cross_attentions = nn.ModuleList(
415
+ [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
416
+ for _ in range(num_layers)]
417
+ )
418
+ self.context_proj = nn.ModuleList(
419
+ [nn.Linear(context_dim, out_channels)
420
+ for _ in range(num_layers)]
421
+ )
422
+
423
+ self.residual_input_conv = nn.ModuleList(
424
+ [
425
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
426
+ for i in range(num_layers)
427
+ ]
428
+ )
429
+ self.down_sample_conv = nn.Conv2d(out_channels, out_channels,
430
+ 4, 2, 1) if self.down_sample else nn.Identity()
431
+
432
+ def forward(self, x, t_emb=None, context=None):
433
+ out = x
434
+ for i in range(self.num_layers):
435
+ # Resnet block of Unet
436
+ resnet_input = out
437
+ out = self.resnet_conv_first[i](out)
438
+ if self.t_emb_dim is not None:
439
+ out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
440
+ out = self.resnet_conv_second[i](out)
441
+ out = out + self.residual_input_conv[i](resnet_input)
442
+
443
+ if self.attn:
444
+ # Attention block of Unet
445
+ batch_size, channels, h, w = out.shape
446
+ in_attn = out.reshape(batch_size, channels, h * w)
447
+ in_attn = self.attention_norms[i](in_attn)
448
+ in_attn = in_attn.transpose(1, 2)
449
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
450
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
451
+ out = out + out_attn
452
+
453
+ if self.cross_attn:
454
+ assert context is not None, "context cannot be None if cross attention layers are used"
455
+ batch_size, channels, h, w = out.shape
456
+ in_attn = out.reshape(batch_size, channels, h * w)
457
+ in_attn = self.cross_attention_norms[i](in_attn)
458
+ in_attn = in_attn.transpose(1, 2)
459
+ assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
460
+ context_proj = self.context_proj[i](context)
461
+ out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
462
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
463
+ out = out + out_attn
464
+
465
+ # Downsample
466
+ out = self.down_sample_conv(out)
467
+ return out
468
+
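+ # Illustrative shape check with hypothetical sizes (not the configured model):
+ # with down_sample=True the spatial resolution halves and channels go in -> out.
+ # blk = DownBlock(64, 128, t_emb_dim=256, down_sample=True, num_heads=4,
+ #                 num_layers=1, attn=True, norm_channels=32)
+ # x = torch.randn(2, 64, 32, 32)
+ # t_emb = torch.randn(2, 256)
+ # print(blk(x, t_emb).shape)                     # torch.Size([2, 128, 16, 16])
+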
469
+
470
+ class MidBlock(nn.Module):
471
+ r"""
472
+ Mid conv block with attention.
473
+ Sequence of the following blocks:
474
+ 1. Resnet block with time embedding
475
+ 2. Attention block
476
+ 3. Resnet block with time embedding
477
+ """
478
+
479
+ def __init__(self, in_channels, out_channels, t_emb_dim, num_heads, num_layers, norm_channels, cross_attn=None, context_dim=None):
480
+ super().__init__()
481
+ self.num_layers = num_layers
482
+ self.t_emb_dim = t_emb_dim
483
+ self.context_dim = context_dim
484
+ self.cross_attn = cross_attn
485
+ self.resnet_conv_first = nn.ModuleList(
486
+ [
487
+ nn.Sequential(
488
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
489
+ nn.SiLU(),
490
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
491
+ padding=1),
492
+ )
493
+ for i in range(num_layers + 1)
494
+ ]
495
+ )
496
+
497
+ if self.t_emb_dim is not None:
498
+ self.t_emb_layers = nn.ModuleList([
499
+ nn.Sequential(
500
+ nn.SiLU(),
501
+ nn.Linear(t_emb_dim, out_channels)
502
+ )
503
+ for _ in range(num_layers + 1)
504
+ ])
505
+ self.resnet_conv_second = nn.ModuleList(
506
+ [
507
+ nn.Sequential(
508
+ nn.GroupNorm(norm_channels, out_channels),
509
+ nn.SiLU(),
510
+ nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
511
+ )
512
+ for _ in range(num_layers + 1)
513
+ ]
514
+ )
515
+
516
+ self.attention_norms = nn.ModuleList(
517
+ [nn.GroupNorm(norm_channels, out_channels)
518
+ for _ in range(num_layers)]
519
+ )
520
+
521
+ self.attentions = nn.ModuleList(
522
+ [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
523
+ for _ in range(num_layers)]
524
+ )
525
+ if self.cross_attn:
526
+ assert context_dim is not None, "Context Dimension must be passed for cross attention"
527
+ self.cross_attention_norms = nn.ModuleList(
528
+ [nn.GroupNorm(norm_channels, out_channels)
529
+ for _ in range(num_layers)]
530
+ )
531
+ self.cross_attentions = nn.ModuleList(
532
+ [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
533
+ for _ in range(num_layers)]
534
+ )
535
+ self.context_proj = nn.ModuleList(
536
+ [nn.Linear(context_dim, out_channels)
537
+ for _ in range(num_layers)]
538
+ )
539
+ self.residual_input_conv = nn.ModuleList(
540
+ [
541
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
542
+ for i in range(num_layers + 1)
543
+ ]
544
+ )
545
+
546
+ def forward(self, x, t_emb=None, context=None):
547
+ out = x
548
+
549
+ # First resnet block
550
+ resnet_input = out
551
+ out = self.resnet_conv_first[0](out)
552
+ if self.t_emb_dim is not None:
553
+ out = out + self.t_emb_layers[0](t_emb)[:, :, None, None]
554
+ out = self.resnet_conv_second[0](out)
555
+ out = out + self.residual_input_conv[0](resnet_input)
556
+
557
+ for i in range(self.num_layers):
558
+ # Attention Block
559
+ batch_size, channels, h, w = out.shape
560
+ in_attn = out.reshape(batch_size, channels, h * w)
561
+ in_attn = self.attention_norms[i](in_attn)
562
+ in_attn = in_attn.transpose(1, 2)
563
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
564
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
565
+ out = out + out_attn
566
+
567
+ if self.cross_attn:
568
+ assert context is not None, "context cannot be None if cross attention layers are used"
569
+ batch_size, channels, h, w = out.shape
570
+ in_attn = out.reshape(batch_size, channels, h * w)
571
+ in_attn = self.cross_attention_norms[i](in_attn)
572
+ in_attn = in_attn.transpose(1, 2)
573
+ assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
574
+ context_proj = self.context_proj[i](context)
575
+ out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
576
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
577
+ out = out + out_attn
578
+
579
+
580
+ # Resnet Block
581
+ resnet_input = out
582
+ out = self.resnet_conv_first[i + 1](out)
583
+ if self.t_emb_dim is not None:
584
+ out = out + self.t_emb_layers[i + 1](t_emb)[:, :, None, None]
585
+ out = self.resnet_conv_second[i + 1](out)
586
+ out = out + self.residual_input_conv[i + 1](resnet_input)
587
+
588
+ return out
589
+
590
+
591
+ class UpBlock(nn.Module):
592
+ r"""
593
+ Up conv block with attention.
594
+ Sequence of the following blocks:
595
+ 1. Upsample
596
+ 2. Concatenate Down block output
597
+ 3. Resnet block with time embedding
598
+ 4. Attention Block
599
+ """
600
+
601
+ def __init__(self, in_channels, out_channels, t_emb_dim,
602
+ up_sample, num_heads, num_layers, attn, norm_channels):
603
+ super().__init__()
604
+ self.num_layers = num_layers
605
+ self.up_sample = up_sample
606
+ self.t_emb_dim = t_emb_dim
607
+ self.attn = attn
608
+ self.resnet_conv_first = nn.ModuleList(
609
+ [
610
+ nn.Sequential(
611
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
612
+ nn.SiLU(),
613
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
614
+ padding=1),
615
+ )
616
+ for i in range(num_layers)
617
+ ]
618
+ )
619
+
620
+ if self.t_emb_dim is not None:
621
+ self.t_emb_layers = nn.ModuleList([
622
+ nn.Sequential(
623
+ nn.SiLU(),
624
+ nn.Linear(t_emb_dim, out_channels)
625
+ )
626
+ for _ in range(num_layers)
627
+ ])
628
+
629
+ self.resnet_conv_second = nn.ModuleList(
630
+ [
631
+ nn.Sequential(
632
+ nn.GroupNorm(norm_channels, out_channels),
633
+ nn.SiLU(),
634
+ nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
635
+ )
636
+ for _ in range(num_layers)
637
+ ]
638
+ )
639
+ if self.attn:
640
+ self.attention_norms = nn.ModuleList(
641
+ [
642
+ nn.GroupNorm(norm_channels, out_channels)
643
+ for _ in range(num_layers)
644
+ ]
645
+ )
646
+
647
+ self.attentions = nn.ModuleList(
648
+ [
649
+ nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
650
+ for _ in range(num_layers)
651
+ ]
652
+ )
653
+
654
+ self.residual_input_conv = nn.ModuleList(
655
+ [
656
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
657
+ for i in range(num_layers)
658
+ ]
659
+ )
660
+ self.up_sample_conv = nn.ConvTranspose2d(in_channels, in_channels,
661
+ 4, 2, 1) \
662
+ if self.up_sample else nn.Identity()
663
+
664
+ def forward(self, x, out_down=None, t_emb=None):
665
+ # Upsample
666
+ x = self.up_sample_conv(x)
667
+
668
+ # Concat with Downblock output
669
+ if out_down is not None:
670
+ x = torch.cat([x, out_down], dim=1)
671
+
672
+ out = x
673
+ for i in range(self.num_layers):
674
+ # Resnet Block
675
+ resnet_input = out
676
+ out = self.resnet_conv_first[i](out)
677
+ if self.t_emb_dim is not None:
678
+ out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
679
+ out = self.resnet_conv_second[i](out)
680
+ out = out + self.residual_input_conv[i](resnet_input)
681
+
682
+ # Self Attention
683
+ if self.attn:
684
+ batch_size, channels, h, w = out.shape
685
+ in_attn = out.reshape(batch_size, channels, h * w)
686
+ in_attn = self.attention_norms[i](in_attn)
687
+ in_attn = in_attn.transpose(1, 2)
688
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
689
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
690
+ out = out + out_attn
691
+ return out
692
+
693
+
694
+ class UpBlockUnet(nn.Module):
695
+ r"""
696
+ Up conv block with attention.
697
+ Sequence of the following blocks:
698
+ 1. Upsample
699
+ 2. Concatenate Down block output
700
+ 3. Resnet block with time embedding
701
+ 4. Self attention block (plus optional cross attention)
702
+ """
703
+
704
+ def __init__(self, in_channels, out_channels, t_emb_dim, up_sample,
705
+ num_heads, num_layers, norm_channels, cross_attn=False, context_dim=None):
706
+ super().__init__()
707
+ self.num_layers = num_layers
708
+ self.up_sample = up_sample
709
+ self.t_emb_dim = t_emb_dim
710
+ self.cross_attn = cross_attn
711
+ self.context_dim = context_dim
712
+ self.resnet_conv_first = nn.ModuleList(
713
+ [
714
+ nn.Sequential(
715
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
716
+ nn.SiLU(),
717
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
718
+ padding=1),
719
+ )
720
+ for i in range(num_layers)
721
+ ]
722
+ )
723
+
724
+ if self.t_emb_dim is not None:
725
+ self.t_emb_layers = nn.ModuleList([
726
+ nn.Sequential(
727
+ nn.SiLU(),
728
+ nn.Linear(t_emb_dim, out_channels)
729
+ )
730
+ for _ in range(num_layers)
731
+ ])
732
+
733
+ self.resnet_conv_second = nn.ModuleList(
734
+ [
735
+ nn.Sequential(
736
+ nn.GroupNorm(norm_channels, out_channels),
737
+ nn.SiLU(),
738
+ nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
739
+ )
740
+ for _ in range(num_layers)
741
+ ]
742
+ )
743
+
744
+ self.attention_norms = nn.ModuleList(
745
+ [
746
+ nn.GroupNorm(norm_channels, out_channels)
747
+ for _ in range(num_layers)
748
+ ]
749
+ )
750
+
751
+ self.attentions = nn.ModuleList(
752
+ [
753
+ nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
754
+ for _ in range(num_layers)
755
+ ]
756
+ )
757
+
758
+ if self.cross_attn:
759
+ assert context_dim is not None, "Context Dimension must be passed for cross attention"
760
+ self.cross_attention_norms = nn.ModuleList(
761
+ [nn.GroupNorm(norm_channels, out_channels)
762
+ for _ in range(num_layers)]
763
+ )
764
+ self.cross_attentions = nn.ModuleList(
765
+ [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
766
+ for _ in range(num_layers)]
767
+ )
768
+ self.context_proj = nn.ModuleList(
769
+ [nn.Linear(context_dim, out_channels)
770
+ for _ in range(num_layers)]
771
+ )
772
+ self.residual_input_conv = nn.ModuleList(
773
+ [
774
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
775
+ for i in range(num_layers)
776
+ ]
777
+ )
778
+ self.up_sample_conv = nn.ConvTranspose2d(in_channels // 2, in_channels // 2,
779
+ 4, 2, 1) \
780
+ if self.up_sample else nn.Identity()
781
+
782
+ def forward(self, x, out_down=None, t_emb=None, context=None):
783
+ x = self.up_sample_conv(x)
784
+ if out_down is not None:
785
+ x = torch.cat([x, out_down], dim=1)
786
+
787
+ out = x
788
+ for i in range(self.num_layers):
789
+ # Resnet
790
+ resnet_input = out
791
+ out = self.resnet_conv_first[i](out)
792
+ if self.t_emb_dim is not None:
793
+ out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
794
+ out = self.resnet_conv_second[i](out)
795
+ out = out + self.residual_input_conv[i](resnet_input)
796
+ # Self Attention
797
+ batch_size, channels, h, w = out.shape
798
+ in_attn = out.reshape(batch_size, channels, h * w)
799
+ in_attn = self.attention_norms[i](in_attn)
800
+ in_attn = in_attn.transpose(1, 2)
801
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
802
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
803
+ out = out + out_attn
804
+ # Cross Attention
805
+ if self.cross_attn:
806
+ assert context is not None, "context cannot be None if cross attention layers are used"
807
+ batch_size, channels, h, w = out.shape
808
+ in_attn = out.reshape(batch_size, channels, h * w)
809
+ in_attn = self.cross_attention_norms[i](in_attn)
810
+ in_attn = in_attn.transpose(1, 2)
811
+ assert len(context.shape) == 3, \
812
+ "Context shape does not match B,_,CONTEXT_DIM"
813
+ assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim,\
814
+ "Context shape does not match B,_,CONTEXT_DIM"
815
+ context_proj = self.context_proj[i](context)
816
+ out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
817
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
818
+ out = out + out_attn
819
+
820
+ return out
821
+
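+ # Channel bookkeeping sketch for UpBlockUnet (hypothetical sizes): `in_channels` is the
+ # count *after* concatenating the skip connection, so the transposed conv upsamples
+ # in_channels // 2 and the Down-block output supplies the other half.
+ # up = UpBlockUnet(in_channels=256, out_channels=64, t_emb_dim=256, up_sample=True,
+ #                  num_heads=4, num_layers=1, norm_channels=32)
+ # x = torch.randn(2, 128, 8, 8)                  # half of in_channels
+ # skip = torch.randn(2, 128, 16, 16)             # matches the upsampled spatial size
+ # t_emb = torch.randn(2, 256)
+ # print(up(x, skip, t_emb).shape)                # torch.Size([2, 64, 16, 16])
+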
822
+ """Vqvae"""
823
+
824
+ import torch
825
+ import torch.nn as nn
826
+
827
+
828
+ class VQVAE(nn.Module):
829
+ def __init__(self, im_channels, model_config):
830
+ super().__init__()
831
+ self.down_channels = model_config.down_channels
832
+ self.mid_channels = model_config.mid_channels
833
+ self.down_sample = model_config.down_sample
834
+ self.num_down_layers = model_config.num_down_layers
835
+ self.num_mid_layers = model_config.num_mid_layers
836
+ self.num_up_layers = model_config.num_up_layers
837
+
838
+ # To disable attention in Downblock of Encoder and Upblock of Decoder
839
+ self.attns = model_config.attn_down
840
+
841
+ # Latent Dimension
842
+ self.z_channels = model_config.z_channels
843
+ self.codebook_size = model_config.codebook_size
844
+ self.norm_channels = model_config.norm_channels
845
+ self.num_heads = model_config.num_heads
846
+
847
+ # Assertion to validate the channel information
848
+ assert self.mid_channels[0] == self.down_channels[-1]
849
+ assert self.mid_channels[-1] == self.down_channels[-1]
850
+ assert len(self.down_sample) == len(self.down_channels) - 1
851
+ assert len(self.attns) == len(self.down_channels) - 1
852
+
853
+ # Wherever we use downsampling in encoder correspondingly use
854
+ # upsampling in decoder
855
+ self.up_sample = list(reversed(self.down_sample))
856
+
857
+ ##################### Encoder ######################
858
+ self.encoder_conv_in = nn.Conv2d(im_channels, self.down_channels[0], kernel_size=3, padding=(1, 1))
859
+
860
+ # Downblock + Midblock
861
+ self.encoder_layers = nn.ModuleList([])
862
+ for i in range(len(self.down_channels) - 1):
863
+ self.encoder_layers.append(DownBlock(self.down_channels[i], self.down_channels[i + 1],
864
+ t_emb_dim=None, down_sample=self.down_sample[i],
865
+ num_heads=self.num_heads,
866
+ num_layers=self.num_down_layers,
867
+ attn=self.attns[i],
868
+ norm_channels=self.norm_channels))
869
+
870
+ self.encoder_mids = nn.ModuleList([])
871
+ for i in range(len(self.mid_channels) - 1):
872
+ self.encoder_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i + 1],
873
+ t_emb_dim=None,
874
+ num_heads=self.num_heads,
875
+ num_layers=self.num_mid_layers,
876
+ norm_channels=self.norm_channels))
877
+
878
+ self.encoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[-1])
879
+ self.encoder_conv_out = nn.Conv2d(self.down_channels[-1], self.z_channels, kernel_size=3, padding=1)
880
+
881
+ # Pre Quantization Convolution
882
+ self.pre_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)
883
+
884
+ # Codebook
885
+ self.embedding = nn.Embedding(self.codebook_size, self.z_channels)
886
+ ####################################################
887
+
888
+ ##################### Decoder ######################
889
+
890
+ # Post Quantization Convolution
891
+ self.post_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)
892
+ self.decoder_conv_in = nn.Conv2d(self.z_channels, self.mid_channels[-1], kernel_size=3, padding=(1, 1))
893
+
894
+ # Midblock + Upblock
895
+ self.decoder_mids = nn.ModuleList([])
896
+ for i in reversed(range(1, len(self.mid_channels))):
897
+ self.decoder_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i - 1],
898
+ t_emb_dim=None,
899
+ num_heads=self.num_heads,
900
+ num_layers=self.num_mid_layers,
901
+ norm_channels=self.norm_channels))
902
+
903
+ self.decoder_layers = nn.ModuleList([])
904
+ for i in reversed(range(1, len(self.down_channels))):
905
+ self.decoder_layers.append(UpBlock(self.down_channels[i], self.down_channels[i - 1],
906
+ t_emb_dim=None, up_sample=self.down_sample[i - 1],
907
+ num_heads=self.num_heads,
908
+ num_layers=self.num_up_layers,
909
+ attn=self.attns[i-1],
910
+ norm_channels=self.norm_channels))
911
+
912
+ self.decoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[0])
913
+ self.decoder_conv_out = nn.Conv2d(self.down_channels[0], im_channels, kernel_size=3, padding=1)
914
+
915
+ def quantize(self, x):
916
+ B, C, H, W = x.shape
917
+
918
+ # B, C, H, W -> B, H, W, C
919
+ x = x.permute(0, 2, 3, 1)
920
+
921
+ # B, H, W, C -> B, H*W, C
922
+ x = x.reshape(x.size(0), -1, x.size(-1))
923
+
924
+ # Find nearest embedding/codebook vector
925
+ # dist between (B, H*W, C) and (B, K, C) -> (B, H*W, K)
926
+ dist = torch.cdist(x, self.embedding.weight[None, :].repeat((x.size(0), 1, 1)))
927
+ # (B, H*W)
928
+ min_encoding_indices = torch.argmin(dist, dim=-1)
929
+
930
+ # Replace encoder output with nearest codebook
931
+ # quant_out -> B*H*W, C
932
+ quant_out = torch.index_select(self.embedding.weight, 0, min_encoding_indices.view(-1))
933
+
934
+ # x -> B*H*W, C
935
+ x = x.reshape((-1, x.size(-1)))
936
+ commitment_loss = torch.mean((quant_out.detach() - x) ** 2)
937
+ codebook_loss = torch.mean((quant_out - x.detach()) ** 2)
938
+ quantize_losses = {
939
+ 'codebook_loss': codebook_loss,
940
+ 'commitment_loss': commitment_loss
941
+ }
942
+ # Straight through estimation
943
+ quant_out = x + (quant_out - x).detach()
944
+
945
+ # quant_out -> B, C, H, W
946
+ quant_out = quant_out.reshape((B, H, W, C)).permute(0, 3, 1, 2)
947
+ min_encoding_indices = min_encoding_indices.reshape((-1, quant_out.size(-2), quant_out.size(-1)))
948
+ return quant_out, quantize_losses, min_encoding_indices
949
+
950
+ def encode(self, x):
951
+ out = self.encoder_conv_in(x)
952
+ for idx, down in enumerate(self.encoder_layers):
953
+ out = down(out)
954
+ for mid in self.encoder_mids:
955
+ out = mid(out)
956
+ out = self.encoder_norm_out(out)
957
+ out = nn.SiLU()(out)
958
+ out = self.encoder_conv_out(out)
959
+ out = self.pre_quant_conv(out)
960
+ out, quant_losses, _ = self.quantize(out)
961
+ return out, quant_losses
962
+
963
+ def decode(self, z):
964
+ out = z
965
+ out = self.post_quant_conv(out)
966
+ out = self.decoder_conv_in(out)
967
+ for mid in self.decoder_mids:
968
+ out = mid(out)
969
+ for idx, up in enumerate(self.decoder_layers):
970
+ out = up(out)
971
+
972
+ out = self.decoder_norm_out(out)
973
+ out = nn.SiLU()(out)
974
+ out = self.decoder_conv_out(out)
975
+ return out
976
+
977
+ def forward(self, x):
978
+ z, quant_losses = self.encode(x)
979
+ out = self.decode(z)
980
+ return out, z, quant_losses
981
+
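+ # Round-trip sketch with a small, hypothetical config (any dot-access namespace works,
+ # e.g. types.SimpleNamespace); the real hyperparameters come from the YAML further below.
+ # from types import SimpleNamespace
+ # cfg = SimpleNamespace(down_channels=[32, 64, 128], mid_channels=[128, 128],
+ #                       down_sample=[True, True], attn_down=[False, False],
+ #                       num_down_layers=1, num_mid_layers=1, num_up_layers=1,
+ #                       z_channels=4, codebook_size=512, norm_channels=16, num_heads=4)
+ # vq = VQVAE(im_channels=3, model_config=cfg)
+ # recon, z, losses = vq(torch.randn(1, 3, 64, 64))
+ # print(recon.shape, z.shape)                    # (1, 3, 64, 64) and (1, 4, 16, 16)
+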
982
+ """Vae"""
983
+
984
+ import torch
985
+ import torch.nn as nn
986
+
987
+
988
+ class VAE(nn.Module):
989
+ def __init__(self, im_channels, model_config):
990
+ super().__init__()
991
+ self.down_channels = model_config['down_channels']
992
+ self.mid_channels = model_config['mid_channels']
993
+ self.down_sample = model_config['down_sample']
994
+ self.num_down_layers = model_config['num_down_layers']
995
+ self.num_mid_layers = model_config['num_mid_layers']
996
+ self.num_up_layers = model_config['num_up_layers']
997
+
998
+ # To disable attention in Downblock of Encoder and Upblock of Decoder
999
+ self.attns = model_config['attn_down']
1000
+
1001
+ # Latent Dimension
1002
+ self.z_channels = model_config['z_channels']
1003
+ self.norm_channels = model_config['norm_channels']
1004
+ self.num_heads = model_config['num_heads']
1005
+
1006
+ # Assertion to validate the channel information
1007
+ assert self.mid_channels[0] == self.down_channels[-1]
1008
+ assert self.mid_channels[-1] == self.down_channels[-1]
1009
+ assert len(self.down_sample) == len(self.down_channels) - 1
1010
+ assert len(self.attns) == len(self.down_channels) - 1
1011
+
1012
+ # Wherever we use downsampling in encoder correspondingly use
1013
+ # upsampling in decoder
1014
+ self.up_sample = list(reversed(self.down_sample))
1015
+
1016
+ ##################### Encoder ######################
1017
+ self.encoder_conv_in = nn.Conv2d(im_channels, self.down_channels[0], kernel_size=3, padding=(1, 1))
1018
+
1019
+ # Downblock + Midblock
1020
+ self.encoder_layers = nn.ModuleList([])
1021
+ for i in range(len(self.down_channels) - 1):
1022
+ self.encoder_layers.append(DownBlock(self.down_channels[i], self.down_channels[i + 1],
1023
+ t_emb_dim=None, down_sample=self.down_sample[i],
1024
+ num_heads=self.num_heads,
1025
+ num_layers=self.num_down_layers,
1026
+ attn=self.attns[i],
1027
+ norm_channels=self.norm_channels))
1028
+
1029
+ self.encoder_mids = nn.ModuleList([])
1030
+ for i in range(len(self.mid_channels) - 1):
1031
+ self.encoder_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i + 1],
1032
+ t_emb_dim=None,
1033
+ num_heads=self.num_heads,
1034
+ num_layers=self.num_mid_layers,
1035
+ norm_channels=self.norm_channels))
1036
+
1037
+ self.encoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[-1])
1038
+ self.encoder_conv_out = nn.Conv2d(self.down_channels[-1], 2*self.z_channels, kernel_size=3, padding=1)
1039
+
1040
+ # Output channels are 2*z_channels because we predict both mean and log-variance
1041
+ self.pre_quant_conv = nn.Conv2d(2*self.z_channels, 2*self.z_channels, kernel_size=1)
1042
+ ####################################################
1043
+
1044
+
1045
+ ##################### Decoder ######################
1046
+ self.post_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)
1047
+ self.decoder_conv_in = nn.Conv2d(self.z_channels, self.mid_channels[-1], kernel_size=3, padding=(1, 1))
1048
+
1049
+ # Midblock + Upblock
1050
+ self.decoder_mids = nn.ModuleList([])
1051
+ for i in reversed(range(1, len(self.mid_channels))):
1052
+ self.decoder_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i - 1],
1053
+ t_emb_dim=None,
1054
+ num_heads=self.num_heads,
1055
+ num_layers=self.num_mid_layers,
1056
+ norm_channels=self.norm_channels))
1057
+
1058
+ self.decoder_layers = nn.ModuleList([])
1059
+ for i in reversed(range(1, len(self.down_channels))):
1060
+ self.decoder_layers.append(UpBlock(self.down_channels[i], self.down_channels[i - 1],
1061
+ t_emb_dim=None, up_sample=self.down_sample[i - 1],
1062
+ num_heads=self.num_heads,
1063
+ num_layers=self.num_up_layers,
1064
+ attn=self.attns[i - 1],
1065
+ norm_channels=self.norm_channels))
1066
+
1067
+ self.decoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[0])
1068
+ self.decoder_conv_out = nn.Conv2d(self.down_channels[0], im_channels, kernel_size=3, padding=1)
1069
+
1070
+ def encode(self, x):
1071
+ out = self.encoder_conv_in(x)
1072
+ for idx, down in enumerate(self.encoder_layers):
1073
+ out = down(out)
1074
+ for mid in self.encoder_mids:
1075
+ out = mid(out)
1076
+ out = self.encoder_norm_out(out)
1077
+ out = nn.SiLU()(out)
1078
+ out = self.encoder_conv_out(out)
1079
+ out = self.pre_quant_conv(out)
1080
+ mean, logvar = torch.chunk(out, 2, dim=1)
1081
+ std = torch.exp(0.5 * logvar)
1082
+ sample = mean + std * torch.randn(mean.shape).to(device=x.device)
1083
+ return sample, out
1084
+
1085
+ def decode(self, z):
1086
+ out = z
1087
+ out = self.post_quant_conv(out)
1088
+ out = self.decoder_conv_in(out)
1089
+ for mid in self.decoder_mids:
1090
+ out = mid(out)
1091
+ for idx, up in enumerate(self.decoder_layers):
1092
+ out = up(out)
1093
+
1094
+ out = self.decoder_norm_out(out)
1095
+ out = nn.SiLU()(out)
1096
+ out = self.decoder_conv_out(out)
1097
+ return out
1098
+
1099
+ def forward(self, x):
1100
+ z, encoder_output = self.encode(x)
1101
+ out = self.decode(z)
1102
+ return out, encoder_output
1103
+
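+ # The encoder output packs mean and log-variance along the channel dim; a standard
+ # KL-to-N(0, 1) regulariser can be computed from it as below (illustrative sketch,
+ # assuming a VAE instance `vae` and an input batch `x`; the weighting is up to the trainer).
+ # recon, enc_out = vae(x)
+ # mean, logvar = torch.chunk(enc_out, 2, dim=1)
+ # kl = 0.5 * torch.sum(torch.exp(logvar) + mean ** 2 - 1 - logvar, dim=[1, 2, 3]).mean()
+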
1104
+ """Discriminator"""
1105
+
1106
+ import torch
1107
+ import torch.nn as nn
1108
+
1109
+
1110
+ class Discriminator(nn.Module):
1111
+ r"""
1112
+ PatchGAN Discriminator.
1113
+ Rather than mapping an IMG_CHANNELS x IMG_H x IMG_W image all the way
+ down to a single scalar value, we predict a grid of values,
+ where each cell is the discriminator's estimate of how likely
+ the corresponding image patch is to be real.
1118
+ """
1119
+
1120
+ def __init__(self, im_channels=3,
1121
+ conv_channels=[64, 128, 256],
1122
+ kernels=[4,4,4,4],
1123
+ strides=[2,2,2,1],
1124
+ paddings=[1,1,1,1]):
1125
+ super().__init__()
1126
+ self.im_channels = im_channels
1127
+ activation = nn.LeakyReLU(0.2)
1128
+ layers_dim = [self.im_channels] + conv_channels + [1]
1129
+ self.layers = nn.ModuleList([
1130
+ nn.Sequential(
1131
+ nn.Conv2d(layers_dim[i], layers_dim[i + 1],
1132
+ kernel_size=kernels[i],
1133
+ stride=strides[i],
1134
+ padding=paddings[i],
1135
+ bias=False if i !=0 else True),
1136
+ nn.BatchNorm2d(layers_dim[i + 1]) if i != len(layers_dim) - 2 and i != 0 else nn.Identity(),
1137
+ activation if i != len(layers_dim) - 2 else nn.Identity()
1138
+ )
1139
+ for i in range(len(layers_dim) - 1)
1140
+ ])
1141
+
1142
+ def forward(self, x):
1143
+ out = x
1144
+ for layer in self.layers:
1145
+ out = layer(out)
1146
+ return out
1147
+
1148
+
1149
+ # if __name__ == '__main__':
1150
+ # x = torch.randn((2,3, 256, 256))
1151
+ # prob = Discriminator(im_channels=3)(x)
1152
+ # print(prob.shape)
1153
+
1154
+ # import os
1155
+
1156
+ # image_paths = [os.path.join("/home/taruntejaneurips23/Ashish/datasets/animefacedata/images", f)
1157
+ # for f in os.listdir("/home/taruntejaneurips23/Ashish/datasets/animefacedata/images")]
1158
+ # image_paths
1159
+
1160
+ import glob
1161
+ import os
1162
+ import torchvision
1163
+ from PIL import Image
1164
+ from tqdm import tqdm, trange
1165
+ # from utils.diffusion_utils import load_latents
1166
+ from torch.utils.data.dataset import Dataset
1167
+
1168
+ import pickle
1169
+ import glob
1170
+ import os
1171
+ import torch
1172
+
1173
+
1174
+ def load_latents(latent_path):
1175
+ r"""
1176
+ Simple utility to load pre-saved latents to speed up LDM training
1177
+ :param latent_path:
1178
+ :return:
1179
+ """
1180
+ latent_maps = {}
1181
+ for fname in glob.glob(os.path.join(latent_path, '*.pkl')):
1182
+ s = pickle.load(open(fname, 'rb'))
1183
+ for k, v in s.items():
1184
+ latent_maps[k] = v[0]
1185
+ return latent_maps
1186
+
1187
+
1188
+ def drop_text_condition(text_embed, im, empty_text_embed, text_drop_prob):
1189
+ if text_drop_prob > 0:
1190
+ text_drop_mask = torch.zeros((im.shape[0]), device=im.device).float().uniform_(0,
1191
+ 1) < text_drop_prob
1192
+ assert empty_text_embed is not None, ("Text Conditioning required as well as"
1193
+ " text dropping but empty text representation not created")
1194
+ text_embed[text_drop_mask, :, :] = empty_text_embed[0]
1195
+ return text_embed
1196
+
1197
+
1198
+ def drop_image_condition(image_condition, im, im_drop_prob):
1199
+ if im_drop_prob > 0:
1200
+ im_drop_mask = torch.zeros((im.shape[0], 1, 1, 1), device=im.device).float().uniform_(0,
1201
+ 1) > im_drop_prob
1202
+ return image_condition * im_drop_mask
1203
+ else:
1204
+ return image_condition
1205
+
1206
+
1207
+ def drop_class_condition(class_condition, class_drop_prob, im):
1208
+ if class_drop_prob > 0:
1209
+ class_drop_mask = torch.zeros((im.shape[0], 1), device=im.device).float().uniform_(0,
1210
+ 1) > class_drop_prob
1211
+ return class_condition * class_drop_mask
1212
+ else:
1213
+ return class_condition
1214
+
1215
+
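+ # Classifier-free-guidance style dropping (illustrative): with class_drop_prob=0.1 roughly
+ # 10% of rows in a one-hot class condition are zeroed, so the model also learns the
+ # unconditional distribution.
+ # im = torch.randn(8, 3, 32, 32)
+ # class_cond = torch.nn.functional.one_hot(torch.randint(0, 10, (8,)), 10).float()
+ # dropped = drop_class_condition(class_cond, class_drop_prob=0.1, im=im)
+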
1216
+ class MnistDataset(Dataset):
1217
+ r"""
1218
+ Nothing special here. Just a simple dataset class for mnist images.
1219
+ Created a dataset class rather than using torchvision to allow
1220
+ replacement with any other image dataset
1221
+ """
1222
+
1223
+ def __init__(self, split, im_path, im_size, im_channels,
1224
+ use_latents=False, latent_path=None, condition_config=None):
1225
+ r"""
1226
+ Init method for initializing the dataset properties
1227
+ :param split: train/test to locate the image files
1228
+ :param im_path: root folder of images
1229
+ :param im_size: size of the images
+ :param im_channels: number of image channels
1231
+ """
1232
+ self.split = split
1233
+ self.im_size = im_size
1234
+ self.im_channels = im_channels
1235
+
1236
+ # Should we use latents or not
1237
+ self.latent_maps = None
1238
+ self.use_latents = False
1239
+
1240
+ # Conditioning for the dataset
1241
+ self.condition_types = [] if condition_config is None else condition_config['condition_types']
1242
+
1243
+ self.images, self.labels = self.load_images(im_path)
1244
+
1245
+ # Whether to load images and call vae or to load latents
1246
+ if use_latents and latent_path is not None:
1247
+ latent_maps = load_latents(latent_path)
1248
+ if len(latent_maps) == len(self.images):
1249
+ self.use_latents = True
1250
+ self.latent_maps = latent_maps
1251
+ print('Found {} latents'.format(len(self.latent_maps)))
1252
+ else:
1253
+ print('Latents not found')
1254
+
1255
+ def load_images(self, im_path):
1256
+ r"""
1257
+ Gets all images from the path specified
1258
+ and stacks them all up
1259
+ :param im_path:
1260
+ :return:
1261
+ """
1262
+ assert os.path.exists(im_path), "images path {} does not exist".format(im_path)
1263
+ ims = []
1264
+ labels = []
1265
+ for d_name in tqdm(os.listdir(im_path)):
1266
+ fnames = glob.glob(os.path.join(im_path, d_name, '*.{}'.format('png')))
1267
+ fnames += glob.glob(os.path.join(im_path, d_name, '*.{}'.format('jpg')))
1268
+ fnames += glob.glob(os.path.join(im_path, d_name, '*.{}'.format('jpeg')))
1269
+ for fname in fnames:
1270
+ ims.append(fname)
1271
+ if 'class' in self.condition_types:
1272
+ labels.append(int(d_name))
1273
+ print('Found {} images for split {}'.format(len(ims), self.split))
1274
+ return ims, labels
1275
+
1276
+ def __len__(self):
1277
+ return len(self.images)
1278
+
1279
+ def __getitem__(self, index):
1280
+ ######## Set Conditioning Info ########
1281
+ cond_inputs = {}
1282
+ if 'class' in self.condition_types:
1283
+ cond_inputs['class'] = self.labels[index]
1284
+ #######################################
1285
+
1286
+ if self.use_latents:
1287
+ latent = self.latent_maps[self.images[index]]
1288
+ if len(self.condition_types) == 0:
1289
+ return latent
1290
+ else:
1291
+ return latent, cond_inputs
1292
+ else:
1293
+ im = Image.open(self.images[index])
1294
+ im_tensor = torchvision.transforms.ToTensor()(im)
1295
+
1296
+ # Convert input to -1 to 1 range.
1297
+ im_tensor = (2 * im_tensor) - 1
1298
+ if len(self.condition_types) == 0:
1299
+ return im_tensor
1300
+ else:
1301
+ return im_tensor, cond_inputs
1302
+
1303
+
1304
+ class AnimeFaceDataset(Dataset):
1305
+ def __init__(self, split, im_path, im_size, im_channels,
1306
+ use_latents=False, latent_path=None, condition_config=None):
1307
+
1308
+ self.split = split
1309
+ self.im_size = im_size
1310
+ self.im_channels = im_channels
1311
+
1312
+ # Should we use latents or not
1313
+ self.latent_maps = None
1314
+ self.use_latents = False
1315
+
1316
+ # Conditioning for the dataset
1317
+ self.condition_types = [] if condition_config is None else condition_config['condition_types']
1318
+
1319
+ self.images = self.load_images(im_path)
1320
+
1321
+ # Whether to load images and call vae or to load latents
1322
+ if use_latents and latent_path is not None:
1323
+ latent_maps = load_latents(latent_path)
1324
+ if len(latent_maps) == len(self.images):
1325
+ self.use_latents = True
1326
+ self.latent_maps = latent_maps
1327
+ print('Found {} latents'.format(len(self.latent_maps)))
1328
+ else:
1329
+ print('Latents not found')
1330
+
1331
+ def load_images(self, im_path):
1332
+ r"""
1333
+ Gets all images from the path specified
1334
+ and stacks them all up
1335
+ :param im_path:
1336
+ :return:
1337
+ """
1338
+ assert os.path.exists(im_path), "images path {} does not exist".format(im_path)
1339
+ # ims = []
1340
+ # labels = []
1341
+ ims = [os.path.join(im_path, f) for f in os.listdir(im_path)]
1342
+ return ims
1343
+
1344
+ def __len__(self):
1345
+ return len(self.images)
1346
+
1347
+ def __getitem__(self, index):
1348
+ ######## Set Conditioning Info ########
1349
+ # cond_inputs = {}
1350
+ # if 'class' in self.condition_types:
1351
+ # cond_inputs['class'] = self.labels[index]
1352
+ #######################################
1353
+
1354
+ if self.use_latents:
1355
+ latent = self.latent_maps[self.images[index]]
1356
+ if len(self.condition_types) == 0:
1357
+ return latent
1358
+ # else:
1359
+ # return latent, cond_inputs
1360
+ else:
1361
+ im = Image.open(self.images[index])
1362
+ im_tensor = torchvision.transforms.Compose([
1363
+ torchvision.transforms.Resize(self.im_size),
1364
+ torchvision.transforms.CenterCrop(self.im_size),
1365
+ torchvision.transforms.ToTensor(),
1366
+ ])(im)
1367
+ im.close()
1368
+ # im_tensor = torchvision.transforms.ToTensor()(im)
1369
+
1370
+ # Convert input to -1 to 1 range.
1371
+ im_tensor = (2 * im_tensor) - 1
1372
+ if len(self.condition_types) == 0:
1373
+ return im_tensor
1374
+ # else:
1375
+ # return im_tensor, cond_inputs
1376
+
1377
+
1378
+ import glob
1379
+ import os
1380
+ import random
1381
+ import torch
1382
+ import torchvision
1383
+ import numpy as np
1384
+ from PIL import Image
1385
+ from tqdm import tqdm
1386
+ from torch.utils.data.dataset import Dataset
1387
+
1388
+
1389
+ class CelebDataset(Dataset):
1390
+ def __init__(self, split, im_path, im_size, im_channels,
1391
+ use_latents=False, latent_path=None, condition_config=None):
1392
+
1393
+ self.split = split
1394
+ self.im_size = im_size
1395
+ self.im_channels = im_channels
1396
+
1397
+ # Should we use latents or not
1398
+ self.latent_maps = None
1399
+ self.use_latents = False
1400
+
1401
+ # Conditioning for the dataset
1402
+ self.condition_types = [] if condition_config is None else condition_config['condition_types']
1403
+
1404
+ self.images = self.load_images(im_path)
1405
+
1406
+ # Whether to load images and call vae or to load latents
1407
+ if use_latents and latent_path is not None:
1408
+ latent_maps = load_latents(latent_path)
1409
+ if len(latent_maps) == len(self.images):
1410
+ self.use_latents = True
1411
+ self.latent_maps = latent_maps
1412
+ print('Found {} latents'.format(len(self.latent_maps)))
1413
+ else:
1414
+ print('Latents not found')
1415
+
1416
+ def load_images(self, im_path):
1417
+ r"""
1418
+ Gets all images from the path specified
1419
+ and stacks them all up
1420
+ :param im_path:
1421
+ :return:
1422
+ """
1423
+ assert os.path.exists(im_path), "images path {} does not exist".format(im_path)
1424
+ # ims = []
1425
+ # labels = []
1426
+ ims = [os.path.join(im_path, f) for f in os.listdir(im_path)]
1427
+ return ims
1428
+
1429
+ def __len__(self):
1430
+ return len(self.images)
1431
+
1432
+ def __getitem__(self, index):
1433
+ ######## Set Conditioning Info ########
1434
+ # cond_inputs = {}
1435
+ # if 'class' in self.condition_types:
1436
+ # cond_inputs['class'] = self.labels[index]
1437
+ #######################################
1438
+
1439
+ if self.use_latents:
1440
+ latent = self.latent_maps[self.images[index]]
1441
+ if len(self.condition_types) == 0:
1442
+ return latent
1443
+ # else:
1444
+ # return latent, cond_inputs
1445
+ else:
1446
+ im = Image.open(self.images[index])
1447
+ im_tensor = torchvision.transforms.Compose([
1448
+ # torchvision.transforms.Resize(self.im_size),
1449
+ torchvision.transforms.CenterCrop(self.im_size),
1450
+ torchvision.transforms.ToTensor(),
1451
+ ])(im)
1452
+ im.close()
1453
+ # im_tensor = torchvision.transforms.ToTensor()(im)
1454
+
1455
+ # Convert input to -1 to 1 range.
1456
+ im_tensor = (2 * im_tensor) - 1
1457
+ if len(self.condition_types) == 0:
1458
+ return im_tensor
1459
+ # else:
1460
+ # return im_tensor, cond_inputs
1461
+ import pandas as pd
1462
+ class CelebHairDataset(Dataset):
1463
+ def __init__(self, split, im_path, im_size, im_channels,
1464
+ use_latents=False, latent_path=None, condition_config=None):
1465
+
1466
+ self.df = pd.read_csv("/home/taruntejaneurips23/Ashish/DDPM/hair_df_100.csv")
1467
+ self.split = split
1468
+ self.im_size = im_size
1469
+ self.im_channels = im_channels
1470
+
1471
+ # Should we use latents or not
1472
+ self.latent_maps = None
1473
+ self.use_latents = False
1474
+
1475
+ # Conditioning for the dataset
1476
+ self.condition_types = [] if condition_config is None else condition_config['condition_types']
1477
+
1478
+ self.images = self.load_images(im_path, self.df)
1479
+
1480
+ # Whether to load images and call vae or to load latents
1481
+ if use_latents and latent_path is not None:
1482
+ latent_maps = load_latents(latent_path)
1483
+ if len(latent_maps) == len(self.images):
1484
+ self.use_latents = True
1485
+ self.latent_maps = latent_maps
1486
+ print('Found {} latents'.format(len(self.latent_maps)))
1487
+ else:
1488
+ print('Latents not found')
1489
+
1490
+ def load_images(self, im_path, df):
1491
+ r"""
1492
+ Gets all images from the path specified
1493
+ and stacks them all up
1494
+ :param im_path:
1495
+ :return:
1496
+ """
1497
+ assert os.path.exists(im_path), "images path {} does not exist".format(im_path)
1498
+ # ims = []
1499
+ # labels = []
1500
+ # ims = [os.path.join(im_path, f) for f in os.listdir(im_path)]
1501
+ ims = [os.path.join(im_path, i) for i in df.image_id.values]
1502
+ return ims
1503
+
1504
+ def __len__(self):
1505
+ return len(self.images)
1506
+
1507
+ def __getitem__(self, index):
1508
+ ######## Set Conditioning Info ########
1509
+ # cond_inputs = {}
1510
+ # if 'class' in self.condition_types:
1511
+ # cond_inputs['class'] = self.labels[index]
1512
+ #######################################
1513
+
1514
+ if self.use_latents:
1515
+ latent = self.latent_maps[self.images[index]]
1516
+ if len(self.condition_types) == 0:
1517
+ return latent
1518
+ # else:
1519
+ # return latent, cond_inputs
1520
+ else:
1521
+ im = Image.open(self.images[index])
1522
+ im_tensor = torchvision.transforms.Compose([
1523
+ # torchvision.transforms.Resize(self.im_size),
1524
+ torchvision.transforms.CenterCrop(self.im_size),
1525
+ torchvision.transforms.ToTensor(),
1526
+ ])(im)
1527
+ im.close()
1528
+ # im_tensor = torchvision.transforms.ToTensor()(im)
1529
+
1530
+ # Convert input to -1 to 1 range.
1531
+ im_tensor = (2 * im_tensor) - 1
1532
+ if len(self.condition_types) == 0:
1533
+ return im_tensor
1534
+ # else:
1535
+ # return im_tensor, cond_inputs
1536
+
1537
+ #"""Train VQVAE"""...............................................................................................................................................
1538
+
1539
+ # Commented out IPython magic to ensure Python compatibility.
1540
+ import torch
1541
+ import torch.nn as nn
1542
+ import yaml
1543
+ from ashish.MTP.Vaani.LDM.scripts.dotdict import DotDict
1544
+
1545
+ config_path = "/home/taruntejaneurips23/Ashish/DDPM/_5_ldm_celeba.yaml"
1546
+ with open(config_path, 'r') as file:
1547
+ Config = yaml.safe_load(file)
1548
+
1549
+
1550
+ Config = DotDict.from_dict(Config)
1551
+ dataset_config = Config.dataset_params
1552
+ diffusion_config = Config.diffusion_params
1553
+ model_config = Config.model_params
1554
+ train_config = Config.train_params
1555
+
1556
+ import torch
1557
+ import os
1558
+ import random
1559
+ import numpy as np
1560
+ import matplotlib.pyplot as plt
1561
+ from tqdm import tqdm
1562
+ from torch.optim import Adam
1563
+ from torch.utils.data import Dataset, TensorDataset, DataLoader
1564
+ # device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
1565
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
1566
+
1567
+
1568
+
1569
+ from torchvision.utils import make_grid
1570
+
1571
+ def trainVAE(Config):
1572
+
1573
+ dataset_config = Config.dataset_params
1574
+ autoencoder_config = Config.autoencoder_params
1575
+ train_config = Config.train_params
1576
+
1577
+ # Set the desired seed value #
1578
+ seed = train_config.seed
1579
+ torch.manual_seed(seed)
1580
+ np.random.seed(seed)
1581
+ random.seed(seed)
1582
+ if device == 'cuda':
1583
+ torch.cuda.manual_seed_all(seed)
1584
+ #############################
1585
+
1586
+ # Create the model and dataset #
1587
+ model = VQVAE(im_channels=dataset_config.im_channels,
1588
+ model_config=autoencoder_config).to(device)
1589
+ # model.load_state_dict(torch.load("/home/taruntejaneurips23/Ashish/DDPM/celebAhair_ldm/vqvae_autoencoder_ckpt.pth", map_location=device))
1590
+ if os.path.exists(os.path.join(train_config.task_name, train_config.vqvae_autoencoder_ckpt_name)):
1591
+ print('Loaded vae checkpoint')
1592
+ model.load_state_dict(torch.load(os.path.join(train_config.task_name, train_config.vqvae_autoencoder_ckpt_name),
1593
+ map_location=device, weights_only=True))
1594
+
1595
+ # Create the dataset
1596
+ im_dataset_cls = {
1597
+ 'mnist': MnistDataset,
1598
+ 'celebA': CelebDataset,
1599
+ 'animeface': AnimeFaceDataset,
1600
+ 'celebAhair': CelebHairDataset
1601
+ }.get(dataset_config.name)
1602
+
1603
+ im_dataset = im_dataset_cls(split='train',
1604
+ im_path=dataset_config.im_path,
1605
+ im_size=dataset_config.im_size,
1606
+ im_channels=dataset_config.im_channels)
1607
+
1608
+ data_loader = DataLoader(im_dataset,
1609
+ batch_size=train_config.autoencoder_batch_size,
1610
+ shuffle=True,
1611
+ num_workers=os.cpu_count(),
1612
+ pin_memory=True,
1613
+ drop_last=True,
1614
+ persistent_workers=True, pin_memory_device=device)
1615
+
1616
+ # Create output directories
1617
+ if not os.path.exists(train_config.task_name):
1618
+ os.mkdir(train_config.task_name)
1619
+
1620
+ num_epochs = train_config.autoencoder_epochs
1621
+
1622
+ # L1/L2 loss for Reconstruction
1623
+ recon_criterion = torch.nn.MSELoss()
1624
+ # Disc Loss can even be BCEWithLogits
1625
+ disc_criterion = torch.nn.MSELoss()
1626
+
1627
+ # No need to freeze lpips as lpips.py takes care of that
1628
+ lpips_model = LPIPS().eval().to(device)
1629
+ discriminator = Discriminator(im_channels=dataset_config.im_channels).to(device)
1630
+ # discriminator.load_state_dict(torch.load("/home/taruntejaneurips23/Ashish/DDPM/celebAhair_ldm/vqvae_discriminator_ckpt.pth", map_location=device))
1631
+ if os.path.exists(os.path.join(train_config.task_name, train_config.vqvae_discriminator_ckpt_name)):
1632
+ print('Loaded discriminator checkpoint')
1633
+ discriminator.load_state_dict(torch.load(os.path.join(train_config.task_name, train_config.vqvae_discriminator_ckpt_name),
1634
+ map_location=device, weights_only=True))
1635
+
1636
+ optimizer_d = Adam(discriminator.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))
1637
+ optimizer_g = Adam(model.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))
1638
+
1639
+ disc_step_start = train_config.disc_start
1640
+ step_count = 0
1641
+
1642
+ # This is for accumulating gradients incase the images are huge
1643
+ # And one cant afford higher batch sizes
1644
+ acc_steps = train_config.autoencoder_acc_steps
1645
+ image_save_steps = train_config.autoencoder_img_save_steps
1646
+ img_save_count = 0
1647
+
1648
+ for epoch_idx in trange(num_epochs, desc='Training VQVAE'):
1649
+ recon_losses = []
1650
+ codebook_losses = []
1651
+ #commitment_losses = []
1652
+ perceptual_losses = []
1653
+ disc_losses = []
1654
+ gen_losses = []
1655
+ losses = []
1656
+
1657
+ optimizer_g.zero_grad()
1658
+ optimizer_d.zero_grad()
1659
+
1660
+ # for im in tqdm(data_loader):
1661
+ for im in data_loader:
1662
+ step_count += 1
1663
+ im = im.float().to(device)
1664
+
1665
+ # Fetch autoencoders output(reconstructions)
1666
+ model_output = model(im)
1667
+ output, z, quantize_losses = model_output
1668
+
1669
+ # Image Saving Logic
1670
+ if step_count % image_save_steps == 0 or step_count == 1:
1671
+ sample_size = min(8, im.shape[0])
1672
+ save_output = torch.clamp(output[:sample_size], -1., 1.).detach().cpu()
1673
+ save_output = ((save_output + 1) / 2)
1674
+ save_input = ((im[:sample_size] + 1) / 2).detach().cpu()
1675
+
1676
+ grid = make_grid(torch.cat([save_input, save_output], dim=0), nrow=sample_size)
1677
+ img = torchvision.transforms.ToPILImage()(grid)
1678
+ if not os.path.exists(os.path.join(train_config.task_name,'vqvae_autoencoder_samples')):
1679
+ os.mkdir(os.path.join(train_config.task_name, 'vqvae_autoencoder_samples'))
1680
+ img.save(os.path.join(train_config.task_name,'vqvae_autoencoder_samples',
1681
+ 'current_autoencoder_sample_{}.png'.format(img_save_count)))
1682
+ img_save_count += 1
1683
+ img.close()
1684
+
1685
+ ######### Optimize Generator ##########
1686
+ # L2 Loss
1687
+ recon_loss = recon_criterion(output, im)
1688
+ recon_losses.append(recon_loss.item())
1689
+ recon_loss = recon_loss / acc_steps
1690
+ g_loss = (recon_loss +
1691
+ (train_config.codebook_weight * quantize_losses['codebook_loss'] / acc_steps) +
1692
+ (train_config.commitment_beta * quantize_losses['commitment_loss'] / acc_steps))
1693
+ codebook_losses.append(train_config.codebook_weight * quantize_losses['codebook_loss'].item())
1694
+ # Adversarial loss only if disc_step_start steps passed
1695
+ if step_count > disc_step_start:
1696
+ disc_fake_pred = discriminator(model_output[0])
1697
+ disc_fake_loss = disc_criterion(disc_fake_pred,
1698
+ torch.ones(disc_fake_pred.shape,
1699
+ device=disc_fake_pred.device))
1700
+ gen_losses.append(train_config.disc_weight * disc_fake_loss.item())
1701
+ g_loss += train_config.disc_weight * disc_fake_loss / acc_steps
1702
+ lpips_loss = torch.mean(lpips_model(output, im)) / acc_steps
1703
+ perceptual_losses.append(train_config.perceptual_weight * lpips_loss.item())
1704
+ g_loss += train_config.perceptual_weight*lpips_loss / acc_steps
1705
+ losses.append(g_loss.item())
1706
+ g_loss.backward()
1707
+ #####################################
1708
+
1709
+ ######### Optimize Discriminator #######
1710
+ if step_count > disc_step_start:
1711
+ fake = output
1712
+ disc_fake_pred = discriminator(fake.detach())
1713
+ disc_real_pred = discriminator(im)
1714
+ disc_fake_loss = disc_criterion(disc_fake_pred,
1715
+ torch.zeros(disc_fake_pred.shape,
1716
+ device=disc_fake_pred.device))
1717
+ disc_real_loss = disc_criterion(disc_real_pred,
1718
+ torch.ones(disc_real_pred.shape,
1719
+ device=disc_real_pred.device))
1720
+ disc_loss = train_config.disc_weight * (disc_fake_loss + disc_real_loss) / 2
1721
+ disc_losses.append(disc_loss.item())
1722
+ disc_loss = disc_loss / acc_steps
1723
+ disc_loss.backward()
1724
+ if step_count % acc_steps == 0:
1725
+ optimizer_d.step()
1726
+ optimizer_d.zero_grad()
1727
+ #####################################
1728
+
1729
+ if step_count % acc_steps == 0:
1730
+ optimizer_g.step()
1731
+ optimizer_g.zero_grad()
1732
+ optimizer_d.step()
1733
+ optimizer_d.zero_grad()
1734
+ optimizer_g.step()
1735
+ optimizer_g.zero_grad()
1736
+ if len(disc_losses) > 0:
1737
+ print(
1738
+ 'Finished epoch: {}/{} | Recon Loss : {:.4f} | Perceptual Loss : {:.4f} | '
1739
+ 'Codebook : {:.4f} | G Loss : {:.4f} | D Loss {:.4f}'.
1740
+ format(epoch_idx + 1,
1741
+ num_epochs,
1742
+ np.mean(recon_losses),
1743
+ np.mean(perceptual_losses),
1744
+ np.mean(codebook_losses),
1745
+ np.mean(gen_losses),
1746
+ np.mean(disc_losses)))
1747
+ else:
1748
+ print('Finished epoch: {}/{} | Recon Loss : {:.4f} | Perceptual Loss : {:.4f} | Codebook : {:.4f}'.
1749
+ format(epoch_idx + 1,
1750
+ num_epochs,
1751
+ np.mean(recon_losses),
1752
+ np.mean(perceptual_losses),
1753
+ np.mean(codebook_losses)))
1754
+
1755
+ torch.save(model.state_dict(), os.path.join(train_config.task_name,
1756
+ train_config.vqvae_autoencoder_ckpt_name))
1757
+ torch.save(discriminator.state_dict(), os.path.join(train_config.task_name,
1758
+ train_config.vqvae_discriminator_ckpt_name))
1759
+ print('Done Training...')
1760
+
1761
+
1762
+ # trainVAE(Config)
1763
+
1764
+ import torch
1765
+ import torch.nn as nn
1766
+
1767
+
1768
+ class Unet(nn.Module):
1769
+ r"""
1770
+ Unet model comprising
1771
+ Down blocks, Mid blocks and Up blocks
1772
+ """
1773
+
1774
+ def __init__(self, im_channels, model_config):
1775
+ super().__init__()
1776
+ self.down_channels = model_config.down_channels
1777
+ self.mid_channels = model_config.mid_channels
1778
+ self.t_emb_dim = model_config.time_emb_dim
1779
+ self.down_sample = model_config.down_sample
1780
+ self.num_down_layers = model_config.num_down_layers
1781
+ self.num_mid_layers = model_config.num_mid_layers
1782
+ self.num_up_layers = model_config.num_up_layers
1783
+ self.attns = model_config.attn_down
1784
+ self.norm_channels = model_config.norm_channels
1785
+ self.num_heads = model_config.num_heads
1786
+ self.conv_out_channels = model_config.conv_out_channels
1787
+
1788
+ assert self.mid_channels[0] == self.down_channels[-1]
1789
+ assert self.mid_channels[-1] == self.down_channels[-2]
1790
+ assert len(self.down_sample) == len(self.down_channels) - 1
1791
+ assert len(self.attns) == len(self.down_channels) - 1
1792
+
1793
+ # Initial projection from sinusoidal time embedding
1794
+ self.t_proj = nn.Sequential(
1795
+ nn.Linear(self.t_emb_dim, self.t_emb_dim),
1796
+ nn.SiLU(),
1797
+ nn.Linear(self.t_emb_dim, self.t_emb_dim),
1798
+ )
1799
+
1800
+ self.up_sample = list(reversed(self.down_sample))
1801
+ self.conv_in = nn.Conv2d(
1802
+ im_channels, self.down_channels[0], kernel_size=3, padding=1
1803
+ )
1804
+
1805
+ # --::----- D O W N - B L O C K S ----------------::--------------::----------------
1806
+ self.downs = nn.ModuleList([])
1807
+ for i in range(len(self.down_channels) - 1):
1808
+ self.downs.append(
1809
+ DownBlock(
1810
+ self.down_channels[i],
1811
+ self.down_channels[i + 1],
1812
+ self.t_emb_dim,
1813
+ down_sample=self.down_sample[i],
1814
+ num_heads=self.num_heads,
1815
+ num_layers=self.num_down_layers,
1816
+ attn=self.attns[i],
1817
+ norm_channels=self.norm_channels,
1818
+ )
1819
+ )
1820
+
1821
+ # --::----- M I D - B L O C K S ----------------::--------------::----------------
1822
+ self.mids = nn.ModuleList([])
1823
+ for i in range(len(self.mid_channels) - 1):
1824
+ self.mids.append(
1825
+ MidBlock(
1826
+ self.mid_channels[i],
1827
+ self.mid_channels[i + 1],
1828
+ self.t_emb_dim,
1829
+ num_heads=self.num_heads,
1830
+ num_layers=self.num_mid_layers,
1831
+ norm_channels=self.norm_channels,
1832
+ )
1833
+ )
1834
+
1835
+ # --::----- U P - B L O C K S ----------------::--------------::----------------
1836
+ self.ups = nn.ModuleList([])
1837
+ for i in reversed(range(len(self.down_channels) - 1)):
1838
+ self.ups.append(
1839
+ UpBlockUnet(
1840
+ self.down_channels[i] * 2,
1841
+ self.down_channels[i - 1] if i != 0 else self.conv_out_channels,
1842
+ self.t_emb_dim,
1843
+ up_sample=self.down_sample[i],
1844
+ num_heads=self.num_heads,
1845
+ num_layers=self.num_up_layers,
1846
+ norm_channels=self.norm_channels,
1847
+ )
1848
+ )
1849
+
1850
+ self.norm_out = nn.GroupNorm(self.norm_channels, self.conv_out_channels)
1851
+ self.conv_out = nn.Conv2d(
1852
+ self.conv_out_channels, im_channels, kernel_size=3, padding=1
1853
+ )
1854
+
1855
+ def forward(self, x, t):
1856
+ # Shapes assuming downblocks are [C1, C2, C3, C4]
1857
+ # Shapes assuming midblocks are [C4, C4, C3]
1858
+ # Shapes assuming downsamples are [True, True, False]
1859
+ # B x C x H x W
1860
+ out = self.conv_in(x)
1861
+ # B x C1 x H x W
1862
+
1863
+ # t_emb -> B x t_emb_dim
1864
+ t_emb = get_time_embedding(torch.as_tensor(t).long(), self.t_emb_dim)
1865
+ t_emb = self.t_proj(t_emb)
1866
+
1867
+ # --- Down Pass ------------------
1868
+ down_outs = []
1869
+ for idx, down in enumerate(self.downs):
1870
+ down_outs.append(out)
1871
+ out = down(out, t_emb)
1872
+ # down_outs [B x C1 x H x W, B x C2 x H/2 x W/2, B x C3 x H/4 x W/4]
1873
+ # out B x C4 x H/4 x W/4
1874
+
1875
+ # --- Mid Pass ------------------
1876
+ for mid in self.mids:
1877
+ out = mid(out, t_emb)
1878
+ # out B x C3 x H/4 x W/4
1879
+
1880
+ # --- Up Pass ------------------
1881
+ for up in self.ups:
1882
+ down_out = down_outs.pop()
1883
+ out = up(out, down_out, t_emb)
1884
+ # out [B x C2 x H/4 x W/4, B x C1 x H/2 x W/2, B x 16 x H x W]
1885
+
1886
+ out = self.norm_out(out)
1887
+ out = nn.SiLU()(out)
1888
+ out = self.conv_out(out)
1889
+ # out B x C x H x W
1890
+ return out
1891
+
1892
+
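A quick shape check helps confirm the Unet wiring before training. The sketch below is illustrative only: the config values are hypothetical (not taken from the actual YAML), and it assumes the DotDict helper, the DownBlock/MidBlock/UpBlockUnet classes and get_time_embedding defined earlier in this script.

    # Hypothetical latent-space Unet config; values chosen only to satisfy the asserts.
    _cfg = DotDict.from_dict({
        'down_channels': [128, 192, 256, 384],
        'mid_channels': [384, 256],
        'time_emb_dim': 256,
        'down_sample': [True, True, False],
        'num_down_layers': 1, 'num_mid_layers': 1, 'num_up_layers': 1,
        'attn_down': [True, True, True],
        'norm_channels': 32, 'num_heads': 8, 'conv_out_channels': 64,
    })
    _unet = Unet(im_channels=3, model_config=_cfg).to(device)
    _z = torch.randn(2, 3, 64, 64, device=device)      # a batch of VQVAE latents
    _t = torch.randint(0, 1000, (2,), device=device)   # random diffusion timesteps
    print(_unet(_z, _t).shape)                          # expected: torch.Size([2, 3, 64, 64])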
1893
+ def trainLDM(Config):
1894
+
1895
+ diffusion_config = Config.diffusion_params
1896
+ dataset_config = Config.dataset_params
1897
+ diffusion_model_config = Config.ldm_params
1898
+ autoencoder_model_config = Config.autoencoder_params
1899
+ train_config = Config.train_params
1900
+
1901
+ # Create the noise scheduler
1902
+ scheduler = LinearNoiseScheduler(num_timesteps=diffusion_config.num_timesteps,
1903
+ beta_start=diffusion_config.beta_start,
1904
+ beta_end=diffusion_config.beta_end)
1905
+ # scheduler = CosineNoiseScheduler(diffusion_config.num_timesteps)
1906
+
1907
+ im_dataset_cls = {
1908
+ 'mnist': MnistDataset,
1909
+ 'celebA': CelebDataset,
1910
+ 'animeface': AnimeFaceDataset,
1911
+ 'celebAhair': CelebHairDataset
1912
+ }.get(dataset_config.name)
1913
+
1914
+ im_dataset = im_dataset_cls(split='train',
1915
+ im_path=dataset_config.im_path,
1916
+ im_size=dataset_config.im_size,
1917
+ im_channels=dataset_config.im_channels,
1918
+ use_latents=True,
1919
+ latent_path=os.path.join(train_config.task_name,
1920
+ train_config.vqvae_latent_dir_name)
1921
+ )
1922
+
1923
+ data_loader = DataLoader(im_dataset,
1924
+ batch_size=train_config.ldm_batch_size,
1925
+ shuffle=True,
1926
+ num_workers=os.cpu_count(),
1927
+ pin_memory=True,
1928
+ drop_last=False,
1929
+ persistent_workers=True, pin_memory_device=device)
1930
+
1931
+ # Instantiate the model
1932
+ model = Unet(im_channels=autoencoder_model_config.z_channels,
1933
+ model_config=diffusion_model_config).to(device)
1934
+ if os.path.exists(os.path.join(train_config.task_name, train_config.ldm_ckpt_name)):
1935
+ print('Loaded ldm checkpoint')
1936
+ model.load_state_dict(torch.load(os.path.join(train_config.task_name, train_config.ldm_ckpt_name), map_location=device, weights_only=True))
1937
+ model.train()
1938
+
1939
+ # Load VAE ONLY if latents are not to be used or are missing
1940
+ if not im_dataset.use_latents:
1941
+ print('Loading vqvae model as latents not present')
1942
+ vae = VQVAE(im_channels=dataset_config.im_channels,
1943
+ model_config=autoencoder_model_config).to(device)
1944
+ vae.eval()
1945
+ # Load vae if found
1946
+ if os.path.exists(os.path.join(train_config.task_name,
1947
+ train_config.vqvae_autoencoder_ckpt_name)):
1948
+ print('Loaded vae checkpoint')
1949
+ vae.load_state_dict(torch.load(os.path.join(train_config.task_name,
1950
+ train_config.vqvae_autoencoder_ckpt_name),
1951
+ map_location=device))
1952
+ # Specify training parameters
1953
+ num_epochs = train_config.ldm_epochs
1954
+ optimizer = Adam(model.parameters(), lr=train_config.ldm_lr)
1955
+ criterion = torch.nn.MSELoss()
1956
+
1957
+ # Run training
1958
+ if not im_dataset.use_latents:
1959
+ for param in vae.parameters():
1960
+ param.requires_grad = False
1961
+
1962
+ for epoch_idx in range(num_epochs):
1963
+ losses = []
1964
+ for im in tqdm(data_loader):
1965
+ optimizer.zero_grad()
1966
+ im = im.float().to(device)
1967
+ if not im_dataset.use_latents:
1968
+ with torch.no_grad():
1969
+ im, _ = vae.encode(im)
1970
+
1971
+ # Sample random noise
1972
+ noise = torch.randn_like(im).to(device)
1973
+
1974
+ # Sample timestep
1975
+ t = torch.randint(0, diffusion_config.num_timesteps, (im.shape[0],)).to(device)
1976
+
1977
+ # Add noise to images according to timestep
1978
+ noisy_im = scheduler.add_noise(im, noise, t)
1979
+ noise_pred = model(noisy_im, t)
1980
+
1981
+ loss = criterion(noise_pred, noise)
1982
+ losses.append(loss.item())
1983
+ loss.backward()
1984
+ optimizer.step()
1985
+ print(f'Finished epoch:{epoch_idx + 1}/{num_epochs} | Loss : {np.mean(losses):.4f}')
1986
+
1987
+ torch.save(model.state_dict(), os.path.join(train_config.task_name,
1988
+ train_config.ldm_ckpt_name))
1989
+
1990
+ # Doing Inference
1991
+ infer(Config)
1992
+
1993
+ # Check whether to continue training
1994
+ train_continue = yaml.safe_load(open("/home/taruntejaneurips23/Ashish/DDPM/_5_ldm_celeba.yaml", 'r'))
1995
+ train_continue = DotDict.from_dict(train_continue)
1996
+ if not train_continue.training._continue_:
1997
+ print('Training stopped ...')
1998
+ break
1999
+
2000
+ print('Done Training ...')
2001
+
2002
+ # trainLDM(Config)
2003
+
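For reference, the scheduler.add_noise(im, noise, t) call in the training loop above is assumed to implement the standard DDPM forward process; a minimal sketch with typical (not necessarily the configured) beta_start/beta_end values:

    # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps
    _betas = torch.linspace(1e-4, 0.02, 1000)          # typical beta schedule, 1000 steps
    _alpha_bars = torch.cumprod(1.0 - _betas, dim=0)
    def _add_noise_reference(x0, eps, t):
        a = _alpha_bars.to(x0.device)[t].view(-1, 1, 1, 1)
        return a.sqrt() * x0 + (1.0 - a).sqrt() * eps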
2004
+ # import subprocess
2005
+ # subprocess.run(f'kill {os.getpid()}', shell=True, check=True)
2006
+
2007
+ def sample(model, scheduler, train_config, diffusion_model_config,
2008
+ autoencoder_model_config, diffusion_config, dataset_config, vae):
2009
+ r"""
2010
+ Sample stepwise by going backward one timestep at a time.
2011
+ We save the x0 predictions
2012
+ """
2013
+ im_size = dataset_config.im_size // 2**sum(autoencoder_model_config.down_sample)
2014
+ xt = torch.randn((train_config.num_samples,
2015
+ autoencoder_model_config.z_channels,
2016
+ im_size,
2017
+ im_size)).to(device)
2018
+
2019
+ save_count = 0
2020
+ for i in tqdm(reversed(range(diffusion_config.num_timesteps)), total=diffusion_config.num_timesteps):
2021
+ # Get prediction of noise
2022
+ noise_pred = model(xt, torch.as_tensor(i).unsqueeze(0).to(device))
2023
+
2024
+ # Use scheduler to get x0 and xt-1
2025
+ xt, x0_pred = scheduler.sample_prev_timestep(xt, noise_pred, torch.as_tensor(i).to(device))
2026
+
2027
+ # Save x0
2028
+ #ims = torch.clamp(xt, -1., 1.).detach().cpu()
2029
+ if i == 0:
2030
+ # Decode ONLY the final image to save time
2031
+ ims = vae.decode(xt)
2032
+ else:
2033
+ ims = xt
2034
+
2035
+ ims = torch.clamp(ims, -1., 1.).detach().cpu()
2036
+ ims = (ims + 1) / 2
2037
+ grid = make_grid(ims, nrow=train_config.num_grid_rows)
2038
+ img = torchvision.transforms.ToPILImage()(grid)
2039
+
2040
+ if not os.path.exists(os.path.join(train_config.task_name, 'samples')):
2041
+ os.mkdir(os.path.join(train_config.task_name, 'samples'))
2042
+ img.save(os.path.join(train_config.task_name, 'samples', 'x0_{}.png'.format(i)))
2043
+ img.close()
2044
+
2045
+
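Sampling starts from pure noise at the latent resolution: each True entry in down_sample halves the spatial size. With illustrative values (a 256x256 image and three downsampling stages, not necessarily the configured ones):

    print(256 // 2 ** sum([True, True, True]))   # 32 -> a 32x32 latent grid to denoise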
2046
+ def infer(Config):
2047
+
2048
+ diffusion_config = Config.diffusion_params
2049
+ dataset_config = Config.dataset_params
2050
+ diffusion_model_config = Config.ldm_params
2051
+ autoencoder_model_config = Config.autoencoder_params
2052
+ train_config = Config.train_params
2053
+
2054
+ # Create the noise scheduler
2055
+ scheduler = LinearNoiseScheduler(num_timesteps=diffusion_config.num_timesteps,
2056
+ beta_start=diffusion_config.beta_start,
2057
+ beta_end=diffusion_config.beta_end)
2058
+ # scheduler = CosineNoiseScheduler(diffusion_config.num_timesteps)
2059
+
2060
+ model = Unet(im_channels=autoencoder_model_config.z_channels,
2061
+ model_config=diffusion_model_config).to(device)
2062
+ model.eval()
2063
+ if os.path.exists(os.path.join(train_config.task_name,
2064
+ train_config.ldm_ckpt_name)):
2065
+ print('Loaded unet checkpoint')
2066
+ model.load_state_dict(torch.load(os.path.join(train_config.task_name,
2067
+ train_config.ldm_ckpt_name),
2068
+ map_location=device))
2069
+ # Create output directories
2070
+ if not os.path.exists(train_config.task_name):
2071
+ os.mkdir(train_config.task_name)
2072
+
2073
+ vae = VQVAE(im_channels=dataset_config.im_channels,
2074
+ model_config=autoencoder_model_config).to(device)
2075
+ vae.eval()
2076
+
2077
+ # Load vae if found
2078
+ if os.path.exists(os.path.join(train_config.task_name,
2079
+ train_config.vqvae_autoencoder_ckpt_name)):
2080
+ print('Loaded vae checkpoint')
2081
+ vae.load_state_dict(torch.load(os.path.join(train_config.task_name,
2082
+ train_config.vqvae_autoencoder_ckpt_name),
2083
+ map_location=device), strict=True)
2084
+ with torch.no_grad():
2085
+ sample(model, scheduler, train_config, diffusion_model_config,
2086
+ autoencoder_model_config, diffusion_config, dataset_config, vae)
2087
+
2088
+
2089
+
2090
+ import argparse
2091
+
2092
+ def get_args():
2093
+ parser = argparse.ArgumentParser(description="Choose between train VAE, train LDM, or infer mode.")
2094
+ parser.add_argument('--mode', choices=['train_vae', 'train_ldm', 'infer'], default='infer',
2095
+ help="Mode to run: train_vae, train_ldm, or infer")
2096
+ return parser.parse_args()
2097
+
2098
+ args = get_args()
2099
+
2100
+ if args.mode == 'train_vae':
2101
+ trainVAE(Config)
2102
+ elif args.mode == 'train_ldm':
2103
+ trainLDM(Config)
2104
+ else:
2105
+ infer(Config)
2106
+
2107
+ # python _5.2_ldm_celeba_hair_cosine.py --mode train_vae
2108
+ # python _5.2_ldm_celeba_hair_cosine.py --mode train_ldm
2109
+ # python _5.2_ldm_celeba_hair_cosine.py --mode infer
2110
+
2111
+
2112
+
2113
+
2114
+ # import matplotlib.pyplot as plt
2115
+ # from PIL import Image
2116
+ # # plt.style.use('dark_background')
2117
+ # # %matplotlib inline
2118
+
2119
+ # plt.imshow(Image.open('/home/taruntejaneurips23/Ashish/DDPM/mnist_ldm/samples/x0_0.png'), cmap='gray')
2120
+
2121
+ # import matplotlib.pyplot as plt
2122
+ # import matplotlib.image as mpimg
2123
+
2124
+ # dataset_name = 'animeface_ldm'
2125
+
2126
+ # image_paths = [f'/home/taruntejaneurips23/Ashish/DDPM/{dataset_name}/samples/x0_0.png',
2127
+ # f'/home/taruntejaneurips23/Ashish/DDPM/{dataset_name}/samples/x0_1.png',
2128
+ # f'/home/taruntejaneurips23/Ashish/DDPM/{dataset_name}/samples/x0_5.png',
2129
+ # f'/home/taruntejaneurips23/Ashish/DDPM/{dataset_name}/samples/x0_100.png',
2130
+ # f'/home/taruntejaneurips23/Ashish/DDPM/{dataset_name}/samples/x0_200.png'
2131
+ # ]
2132
+
2133
+ # fig, axes = plt.subplots(1, len(image_paths), figsize=(15, 5))
2134
+
2135
+ # for i, path in enumerate(image_paths):
2136
+ # img = mpimg.imread(path)
2137
+ # axes[i].imshow(img)
2138
+ # axes[i].axis('off') # Hide axes
2139
+ # axes[i].set_title(f't = {path.split("/")[-1].split(".")[0].split("_")[-1]}')
2140
+
2141
+ # plt.tight_layout()
2142
+ # plt.show()
2143
+
2144
+ # ---------------------------------------------------------
2145
+ # ---------- T H E - E N D -------------------------------
2146
+ # ---------------------------------------------------------
2147
+
2148
+
2149
+
2150
+ def save_checkpoint(
2151
+ total_steps, epoch, model, discriminator,
2152
+ optimizer_d, optimizer_g, loss, checkpoint_path
2153
+ ):
2154
+ checkpoint = {
2155
+ "total_steps": total_steps,
2156
+ "epoch": epoch,
2157
+ "model_state_dict": model.state_dict(),
2158
+ "discriminator_state_dict": discriminator.state_dict(),
2159
+ "optimizer_d_state_dict": optimizer_d.state_dict(),
2160
+ "optimizer_g_state_dict": optimizer_g.state_dict(),
2161
+ "loss": loss,
2162
+ }
2163
+ torch.save(checkpoint, checkpoint_path)
2164
+ print(f"Checkpoint saved after {total_steps} steps at epoch {epoch}")
2165
+
2166
+
2167
+ def load_checkpoint(
2168
+ checkpoint_path, model, discriminator, optimizer_d, optimizer_g
2169
+ ):
2170
+ if os.path.exists(checkpoint_path):
2171
+ checkpoint = torch.load(checkpoint_path)
2172
+ model.load_state_dict(checkpoint["model_state_dict"])
2173
+ discriminator.load_state_dict(checkpoint["discriminator_state_dict"])
2174
+ if optimizer_d is not None: optimizer_d.load_state_dict(checkpoint["optimizer_d_state_dict"])
2175
+ if optimizer_g is not None: optimizer_g.load_state_dict(checkpoint["optimizer_g_state_dict"])
2176
+ total_steps = checkpoint["total_steps"]
2177
+ start_epoch = checkpoint["epoch"] + 1
2178
+ loss = checkpoint["loss"]
2179
+ print(f"Checkpoint loaded. Resuming from epoch {start_epoch}")
2180
+ return total_steps, start_epoch, loss
2181
+ else:
2182
+ print("No checkpoint found. Starting from scratch.")
2183
+ return 0, 0, None
2184
+
2185
+
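A toy round trip of the two helpers above, using throwaway linear modules and a made-up file name, to show the expected call order and return values:

    _toy_g, _toy_d = nn.Linear(4, 4), nn.Linear(4, 4)
    _opt_g = torch.optim.Adam(_toy_g.parameters())
    _opt_d = torch.optim.Adam(_toy_d.parameters())
    save_checkpoint(10, 0, _toy_g, _toy_d, _opt_d, _opt_g,
                    loss=0.5, checkpoint_path="toy_ckpt.pth")
    steps, start_epoch, last_loss = load_checkpoint("toy_ckpt.pth", _toy_g, _toy_d, _opt_d, _opt_g)
    print(steps, start_epoch, last_loss)   # 10 1 0.5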
2186
+ def trainVAE(Config, dataloader):
2187
+ """
2188
+ Trains a VQVAE model using the provided configuration and data loader.
2189
+ """
2190
+ # --- Configurations ----------------------------------------------------
2191
+ dataset_config = Config.dataset_params
2192
+ autoencoder_config = Config.autoencoder_params
2193
+ train_config = Config.train_params
2194
+
2195
+ seed = train_config.seed
2196
+ torch.manual_seed(seed)
2197
+ np.random.seed(seed)
2198
+ random.seed(seed)
2199
+ if device == "cuda":
2200
+ torch.cuda.manual_seed_all(seed)
2201
+
2202
+ # --- Model Initialization ----------------------------------------------
2203
+ model = VQVAE(im_channels=dataset_config.im_channels, model_config=autoencoder_config).to(device)
2204
+ discriminator = Discriminator(im_channels=dataset_config.im_channels).to(device)
2205
+
2206
+ # --- Load Checkpoints --------------------------------------------------
2207
+ checkpoint_path = os.path.join(train_config.task_name, "vqvae_checkpoint.pth")
2208
+ total_steps, start_epoch, _ = load_checkpoint(checkpoint_path, model, discriminator, None, None)
2209
+
2210
+ # --- Loss Function Initialization --------------------------------------
2211
+ recon_criterion = torch.nn.MSELoss()
2212
+ lpips_model = LPIPS().eval().to(device)
2213
+ disc_criterion = torch.nn.MSELoss()
2214
+
2215
+ # --- Optimizer Initialization ------------------------------------------
2216
+ optimizer_d = torch.optim.AdamW(discriminator.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))
2217
+ optimizer_g = torch.optim.AdamW(model.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))
2218
+
2219
+ num_epochs = train_config.autoencoder_epochs
2220
+ acc_steps = train_config.autoencoder_acc_steps
2221
+ image_save_steps = train_config.autoencoder_img_save_steps
2222
+ img_save_count = 0
2223
+
2224
+ # Create necessary directories
2225
+ os.makedirs(os.path.join(train_config.task_name, "vqvae_autoencoder_samples"), exist_ok=True)
2226
+
2227
+ # --- Training Loop -----------------------------------------------------
2228
+ for epoch_idx in range(start_epoch, num_epochs):
2229
+ recon_losses, codebook_losses, perceptual_losses, disc_losses, gen_losses = [], [], [], [], []
2230
+
2231
+ for images in dataloader:
2232
+ total_steps += 1
2233
+ images = images.to(device)
2234
+
2235
+ # Forward pass
2236
+ model_output = model(images)
2237
+ output, z, quantize_losses = model_output
2238
+
2239
+ # Save generated images periodically
2240
+ if total_steps % image_save_steps == 0 or total_steps == 1:
2241
+ sample_size = min(8, images.shape[0])
2242
+ save_output = torch.clamp(output[:sample_size], -1.0, 1.0).detach().cpu()
2243
+ save_output = (save_output + 1) / 2
2244
+ save_input = ((images[:sample_size] + 1) / 2).detach().cpu()
2245
+
2246
+ grid = make_grid(torch.cat([save_input, save_output], dim=0), nrow=sample_size)
2247
+ img = tv.transforms.ToPILImage()(grid)
2248
+ img.save(
2249
+ os.path.join(
2250
+ train_config.task_name,
2251
+ "vqvae_autoencoder_samples",
2252
+ f"current_autoencoder_sample_{img_save_count}.png",
2253
+ )
2254
+ )
2255
+ img_save_count += 1
2256
+ img.close()
2257
+
2258
+ # Reconstruction Loss
2259
+ recon_loss = recon_criterion(output, images) / acc_steps
2260
+ recon_losses.append(recon_loss.item())
2261
+
2262
+ # Generator Loss
2263
+ codebook_loss = train_config.codebook_weight * quantize_losses["codebook_loss"] / acc_steps
2264
+ perceptual_loss = train_config.perceptual_weight * lpips_model(output, images).mean() / acc_steps
2265
+ g_loss = recon_loss + codebook_loss + perceptual_loss
2266
+
2267
+ if total_steps > train_config.disc_start:
2268
+ disc_fake_pred = discriminator(output)
2269
+ gen_loss = train_config.disc_weight * disc_criterion(
2270
+ disc_fake_pred, torch.ones_like(disc_fake_pred)
2271
+ ) / acc_steps
2272
+ g_loss += gen_loss
2273
+ gen_losses.append(gen_loss.item())
2274
+
2275
+ g_loss.backward()
2276
+ optimizer_g.step()
2277
+ optimizer_g.zero_grad()
2278
+
2279
+ # Discriminator Loss
2280
+ if total_steps > train_config.disc_start:
2281
+ disc_fake_pred = discriminator(output.detach())
2282
+ disc_real_pred = discriminator(images)
2283
+ disc_fake_loss = disc_criterion(
2284
+ disc_fake_pred, torch.zeros_like(disc_fake_pred)
2285
+ ) / acc_steps
2286
+ disc_real_loss = disc_criterion(
2287
+ disc_real_pred, torch.ones_like(disc_real_pred)
2288
+ ) / acc_steps
2289
+ disc_loss = train_config.disc_weight * (disc_fake_loss + disc_real_loss) / 2
2290
+ disc_loss.backward()
2291
+ optimizer_d.step()
2292
+ optimizer_d.zero_grad()
2293
+ disc_losses.append(disc_loss.item())
2294
+
2295
+ # Save checkpoint after each epoch
2296
+ save_checkpoint(total_steps, epoch_idx, model, discriminator, optimizer_d, optimizer_g, recon_losses, checkpoint_path)
2297
+
2298
+ # Print epoch summary
2299
+ print(
2300
+ f"Epoch {epoch_idx + 1}/{num_epochs} | Recon Loss: {np.mean(recon_losses):.4f} | "
2301
+ f"Perceptual Loss: {np.mean(perceptual_losses):.4f} | Codebook Loss: {np.mean(codebook_losses):.4f} | "
2302
+ f"G Loss: {np.mean(gen_losses):.4f} | D Loss: {np.mean(disc_losses):.4f}"
2303
+ )
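Note that this simplified loop divides each loss by acc_steps but still steps the optimizers on every batch. If accumulation over acc_steps batches is actually intended, the usual pattern is the following self-contained sketch (toy module and optimizer, not the training code above):

    acc_steps = 4
    _toy = nn.Linear(8, 1)
    _opt = torch.optim.Adam(_toy.parameters())
    for step in range(1, 9):
        x = torch.randn(16, 8)
        loss = _toy(x).pow(2).mean() / acc_steps   # scale each mini-batch loss
        loss.backward()                            # gradients accumulate in .grad
        if step % acc_steps == 0:                  # update only every acc_steps batches
            _opt.step()
            _opt.zero_grad()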
Vaani/LDM/scripts/SLURM-AE-Train.sh ADDED
@@ -0,0 +1,21 @@
1
+ #!/bin/bash -x
2
+ #SBATCH -p gpu
3
+ #SBATCH -N 1
4
+ #SBATCH --ntasks-per-node=48
5
+ #SBATCH --mem 128G
6
+ #SBATCH -t 2-00:00:00
7
+ #SBATCH -J ASHISH_AE_Train
8
+ #SBATCH -o %j.out # name of stdout output file(--output)
9
+ #SBATCH -e %j.err # name of stderr error file(--error)
10
+ cd $SLURM_WORKDIR
11
+
12
+ module purge
13
+ module load miniconda # load the module and environment
14
+ source /home/apps/miniconda3/etc/profile.d/conda.sh
15
+ conda env list
16
+ conda activate aku # load working environment
17
+
18
+ python "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/LDM/scripts/Vaani-VQVAE-Main.py" > "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/LDM/scripts/AE-training.log" 2>&1 # run python script
19
+
20
+ conda deactivate # deactivate environment
21
+ # end of script
Vaani/LDM/scripts/SLURM-AE-Train2.sh ADDED
@@ -0,0 +1,21 @@
1
+ #!/bin/bash -x
2
+ #SBATCH -p gpu
3
+ #SBATCH -N 1
4
+ #SBATCH --ntasks-per-node=48
5
+ #SBATCH --mem 128G
6
+ #SBATCH -t 10:00:00
7
+ #SBATCH -J ASHISH_AE_Train
8
+ #SBATCH -o %j.out # name of stdout output file(--output)
9
+ #SBATCH -e %j.err # name of stderr error file(--error)
10
+ cd $SLURM_WORKDIR
11
+
12
+ module purge
13
+ module load miniconda # load the module and environment
14
+ source /home/apps/miniconda3/etc/profile.d/conda.sh
15
+ conda env list
16
+ conda activate aku # load working environment
17
+
18
+ python "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/LDM/scripts/Vaani-VQVAE-Main.py" > "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/LDM/scripts/AE-training.log" 2>&1 # run python script
19
+
20
+ conda deactivate # deactivate environment
21
+ # end of script
Vaani/LDM/scripts/Vaani-VQVAE-Main.py ADDED
@@ -0,0 +1,1151 @@
1
+ # ==================================================================
2
+ # V Q - V A E T R A I N I N G
3
+ # ==================================================================
4
+ # Author : Ashish Kumar Uchadiya
5
+ # Created : November 3, 2024
6
+ # Description: This script implements the training of a VQ-VAE model for
7
+ # image reconstruction. It uses LPIPS (Learned Perceptual Image Patch Similarity)
8
+ # loss to capture perceptual differences and PatchGAN loss to enforce local
9
+ # realism. The model maps images to a discrete latent space and reconstructs
10
+ # high-fidelity outputs by minimizing these combined losses.
11
+ # ==================================================================
12
+ # I M P O R T S
13
+ # ==================================================================
14
+
15
+
16
+ import os
17
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import numpy as np
22
+ from collections import namedtuple
23
+
24
+ import pandas as pd
25
+ import torchvision as tv
26
+ from torchvision.transforms import v2
27
+ from tqdm.auto import tqdm, trange
28
+ import matplotlib.pyplot as plt
29
+
30
+ import yaml
31
+ import random
32
+ import datetime
33
+ import torch.hub
34
+ from torch.utils.data import Dataset, DataLoader
35
+ from torchvision.utils import make_grid
36
+
37
+ print("TIME:", datetime.datetime.now())
38
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
39
+ print("DEVICE:", device)
40
+
41
+
42
+ # ==================================================================
43
+ # H E L P E R S
44
+ # ==================================================================
45
+ from typing import Any
46
+ from argparse import Namespace
47
+ import typing
48
+
49
+
50
+ class DotDict(Namespace):
51
+ """A simple class that builds upon `argparse.Namespace`
52
+ in order to make chained attributes possible."""
53
+
54
+ def __init__(self, temp=False, key=None, parent=None) -> None:
55
+ self._temp = temp
56
+ self._key = key
57
+ self._parent = parent
58
+
59
+ def __eq__(self, other):
60
+ if not isinstance(other, DotDict):
61
+ return NotImplemented
62
+ return vars(self) == vars(other)
63
+
64
+ def __getattr__(self, __name: str) -> Any:
65
+ if __name not in self.__dict__ and not self._temp:
66
+ self.__dict__[__name] = DotDict(temp=True, key=__name, parent=self)
67
+ else:
68
+ del self._parent.__dict__[self._key]
69
+ raise AttributeError("No attribute '%s'" % __name)
70
+ return self.__dict__[__name]
71
+
72
+ def __repr__(self) -> str:
73
+ item_keys = [k for k in self.__dict__ if not k.startswith("_")]
74
+
75
+ if len(item_keys) == 0:
76
+ return "DotDict()"
77
+ elif len(item_keys) == 1:
78
+ key = item_keys[0]
79
+ val = self.__dict__[key]
80
+ return "DotDict(%s=%s)" % (key, repr(val))
81
+ else:
82
+ return "DotDict(%s)" % ", ".join(
83
+ "%s=%s" % (key, repr(val)) for key, val in self.__dict__.items()
84
+ )
85
+
86
+ @classmethod
87
+ def from_dict(cls, original: typing.Mapping[str, Any]) -> "DotDict":
88
+ """Create a DotDict from a (possibly nested) dict `original`.
89
+ Warning: this method should not be used on very deeply nested inputs,
90
+ since it's recursively traversing the nested dictionary values.
91
+ """
92
+ dd = DotDict()
93
+ for key, value in original.items():
94
+ if isinstance(value, typing.Mapping):
95
+ value = cls.from_dict(value)
96
+ setattr(dd, key, value)
97
+ return dd
98
+
99
+
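A quick usage check for DotDict (the keys here are made up, not taken from the real config):

    _cfg = DotDict.from_dict({'train_params': {'seed': 42, 'autoencoder_lr': 1e-4}})
    print(_cfg.train_params.seed)             # 42
    print(_cfg.train_params.autoencoder_lr)   # 0.0001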
100
+ # ==================================================================
101
+ # L P I P S
102
+ # ==================================================================
103
+ class vgg16(nn.Module):
104
+ def __init__(self):
105
+ super(vgg16, self).__init__()
106
+ vgg_pretrained_features = tv.models.vgg16(
107
+ weights=tv.models.VGG16_Weights.IMAGENET1K_V1
108
+ ).features
109
+ self.slice1 = torch.nn.Sequential()
110
+ self.slice2 = torch.nn.Sequential()
111
+ self.slice3 = torch.nn.Sequential()
112
+ self.slice4 = torch.nn.Sequential()
113
+ self.slice5 = torch.nn.Sequential()
114
+ self.N_slices = 5
115
+ for x in range(4):
116
+ self.slice1.add_module(str(x), vgg_pretrained_features[x])
117
+ for x in range(4, 9):
118
+ self.slice2.add_module(str(x), vgg_pretrained_features[x])
119
+ for x in range(9, 16):
120
+ self.slice3.add_module(str(x), vgg_pretrained_features[x])
121
+ for x in range(16, 23):
122
+ self.slice4.add_module(str(x), vgg_pretrained_features[x])
123
+ for x in range(23, 30):
124
+ self.slice5.add_module(str(x), vgg_pretrained_features[x])
125
+
126
+ self.eval()
127
+ for param in self.parameters():
128
+ param.requires_grad = False
129
+
130
+ def forward(self, X):
131
+ h1 = self.slice1(X)
132
+ h2 = self.slice2(h1)
133
+ h3 = self.slice3(h2)
134
+ h4 = self.slice4(h3)
135
+ h5 = self.slice5(h4)
136
+ vgg_outputs = namedtuple("VggOutputs", ['h1', 'h2', 'h3', 'h4', 'h5'])
137
+ out = vgg_outputs(h1, h2, h3, h4, h5)
138
+ return out
139
+
140
+
141
+ def _spatial_average(in_tens, keepdim=True):
142
+ return in_tens.mean([2, 3], keepdim=keepdim)
143
+
144
+
145
+ def _normalize_tensor(in_feat, eps= 1e-8):
146
+ norm_factor = torch.sqrt(eps + torch.sum(in_feat**2, dim=1, keepdim=True))
147
+ return in_feat / norm_factor
148
+
149
+
150
+ class ScalingLayer(nn.Module):
151
+ def __init__(self):
152
+ super(ScalingLayer, self).__init__()
153
+ # ImageNet normalization for inputs in [0, 1]
154
+ # mean = [0.485, 0.456, 0.406]
155
+ # std = [0.229, 0.224, 0.225]
156
+
157
+ self.register_buffer('shift', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
158
+ self.register_buffer('scale', torch.Tensor([.458, .448, .450])[None, :, None, None])
159
+
160
+ def forward(self, inp):
161
+ return (inp - self.shift) / self.scale
162
+
163
+
164
+ class NetLinLayer(nn.Module):
165
+ ''' A single linear layer which does a 1x1 conv '''
166
+ def __init__(self, chn_in, chn_out=1, use_dropout=False):
167
+ super(NetLinLayer, self).__init__()
168
+ layers = [nn.Dropout(), ] if (use_dropout) else []
169
+ layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ]
170
+ self.model = nn.Sequential(*layers)
171
+
172
+ def forward(self, x):
173
+ return self.model(x)
174
+
175
+
176
+ class LPIPS(nn.Module):
177
+ def __init__(self, net='vgg', version='0.1', use_dropout=True):
178
+ super(LPIPS, self).__init__()
179
+ self.version = version
180
+ self.scaling_layer = ScalingLayer()
181
+ self.chns = [64, 128, 256, 512, 512]
182
+ self.L = len(self.chns)
183
+ self.net = vgg16()
184
+ self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
185
+ self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
186
+ self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
187
+ self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
188
+ self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
189
+ self.lins = nn.ModuleList([self.lin0, self.lin1, self.lin2, self.lin3, self.lin4])
190
+
191
+ # --- Original url -------------------
192
+ # weights_url = f"https://github.com/richzhang/PerceptualSimilarity/raw/master/lpips/weights/v{version}/{net}.pth"
193
+
194
+ # --- Original Forked url ------------
195
+ weights_url = f"https://github.com/akuresonite/PerceptualSimilarity-Forked/raw/master/lpips/weights/v{version}/{net}.pth"
196
+
197
+ # --- Original torchmetrics url ------
198
+ # weights_url = "https://github.com/Lightning-AI/torchmetrics/raw/master/src/torchmetrics/functional/image/lpips_models/vgg.pth"
199
+
200
+ state_dict = torch.hub.load_state_dict_from_url(weights_url, map_location='cpu')
201
+ self.load_state_dict(state_dict, strict=False)
202
+
203
+ self.eval()
204
+ for param in self.parameters():
205
+ param.requires_grad = False
206
+
207
+ def forward(self, in0, in1, normalize=False):
208
+ # Scale the inputs to -1 to +1 range if input in [0,1]
209
+ if normalize:
210
+ in0 = 2 * in0 - 1
211
+ in1 = 2 * in1 - 1
212
+
213
+ in0_input, in1_input = self.scaling_layer(in0), self.scaling_layer(in1)
214
+ # in0_input, in1_input = in0, in1
215
+
216
+ outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
217
+
218
+ diffs = {}
219
+ for kk in range(self.L):
220
+ feats0 = _normalize_tensor(outs0[kk])
221
+ feats1 = _normalize_tensor(outs1[kk])
222
+ diffs[kk] = (feats0 - feats1) ** 2
223
+
224
+ res = [_spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)]
225
+ val = sum(res)
226
+ return val.reshape(-1)
227
+
228
+
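Minimal usage sketch for the LPIPS module above. It expects inputs in [-1, 1] (or pass normalize=True for [0, 1] inputs); instantiation downloads the VGG backbone and the linear-head weights, so network access is required. The tensors here are random placeholders.

    _lpips = LPIPS().eval().to(device)
    _a = torch.rand(4, 3, 256, 256, device=device) * 2 - 1
    _b = torch.rand(4, 3, 256, 256, device=device) * 2 - 1
    with torch.no_grad():
        _d = _lpips(_a, _b)      # per-image perceptual distance
    print(_d.shape)              # torch.Size([4])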
229
+ # ==================================================================
230
+ # P A T C H - G A N - D I S C R I M I N A T O R
231
+ # ==================================================================
232
+ class Discriminator(nn.Module):
233
+ r"""
234
+ PatchGAN Discriminator.
235
+ Rather than taking IMG_CHANNELSxIMG_HxIMG_W all the way to
236
+ 1 scalar value , we instead predict grid of values.
237
+ Where each grid is prediction of how likely
238
+ the discriminator thinks that the image patch corresponding
239
+ to the grid cell is real
240
+ """
241
+
242
+ def __init__(
243
+ self,
244
+ im_channels=3,
245
+ conv_channels=[64, 128, 256],
246
+ kernels=[4, 4, 4, 4],
247
+ strides=[2, 2, 2, 1],
248
+ paddings=[1, 1, 1, 1],
249
+ ):
250
+ super().__init__()
251
+ self.im_channels = im_channels
252
+ activation = nn.LeakyReLU(0.2)
253
+ layers_dim = [self.im_channels] + conv_channels + [1]
254
+ self.layers = nn.ModuleList(
255
+ [
256
+ nn.Sequential(
257
+ nn.Conv2d(
258
+ layers_dim[i],
259
+ layers_dim[i + 1],
260
+ kernel_size=kernels[i],
261
+ stride=strides[i],
262
+ padding=paddings[i],
263
+ bias=False if i != 0 else True,
264
+ ),
265
+ (
266
+ nn.BatchNorm2d(layers_dim[i + 1])
267
+ if i != len(layers_dim) - 2 and i != 0
268
+ else nn.Identity()
269
+ ),
270
+ activation if i != len(layers_dim) - 2 else nn.Identity(),
271
+ )
272
+ for i in range(len(layers_dim) - 1)
273
+ ]
274
+ )
275
+
276
+ def forward(self, x):
277
+ out = x
278
+ for layer in self.layers:
279
+ out = layer(out)
280
+ return out
281
+
282
+
283
+
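As the docstring says, the output is a grid of logits rather than a single scalar; a quick shape check with the default layer sizes and a random input:

    _disc = Discriminator(im_channels=3)
    _logits = _disc(torch.randn(2, 3, 256, 256))
    print(_logits.shape)   # torch.Size([2, 1, 31, 31]) -- one logit per image patch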
284
+ # ==================================================================
285
+ # D O W N - B L O C K
286
+ # ==================================================================
287
+ class DownBlock(nn.Module):
288
+ r"""
289
+ Down conv block with attention.
290
+ Sequence of the following blocks
291
+ 1. Resnet block with time embedding
292
+ 2. Attention block
293
+ 3. Downsample
294
+ """
295
+
296
+ def __init__(
297
+ self,
298
+ in_channels,
299
+ out_channels,
300
+ t_emb_dim,
301
+ down_sample,
302
+ num_heads,
303
+ num_layers,
304
+ attn,
305
+ norm_channels,
306
+ cross_attn=False,
307
+ context_dim=None,
308
+ ):
309
+ super().__init__()
310
+ self.num_layers = num_layers
311
+ self.down_sample = down_sample
312
+ self.attn = attn
313
+ self.context_dim = context_dim
314
+ self.cross_attn = cross_attn
315
+ self.t_emb_dim = t_emb_dim
316
+ self.resnet_conv_first = nn.ModuleList(
317
+ [
318
+ nn.Sequential(
319
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
320
+ nn.SiLU(),
321
+ nn.Conv2d(
322
+ in_channels if i == 0 else out_channels,
323
+ out_channels,
324
+ kernel_size=3,
325
+ stride=1,
326
+ padding=1,
327
+ ),
328
+ )
329
+ for i in range(num_layers)
330
+ ]
331
+ )
332
+ if self.t_emb_dim is not None:
333
+ self.t_emb_layers = nn.ModuleList(
334
+ [
335
+ nn.Sequential(nn.SiLU(), nn.Linear(self.t_emb_dim, out_channels))
336
+ for _ in range(num_layers)
337
+ ]
338
+ )
339
+ self.resnet_conv_second = nn.ModuleList(
340
+ [
341
+ nn.Sequential(
342
+ nn.GroupNorm(norm_channels, out_channels),
343
+ nn.SiLU(),
344
+ nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
345
+ )
346
+ for _ in range(num_layers)
347
+ ]
348
+ )
349
+
350
+ if self.attn:
351
+ self.attention_norms = nn.ModuleList(
352
+ [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]
353
+ )
354
+
355
+ self.attentions = nn.ModuleList(
356
+ [
357
+ nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
358
+ for _ in range(num_layers)
359
+ ]
360
+ )
361
+ if self.cross_attn:
362
+ assert context_dim is not None, "Context Dimension must be passed for cross attention"
363
+ self.cross_attention_norms = nn.ModuleList(
364
+ [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]
365
+ )
366
+ self.cross_attentions = nn.ModuleList(
367
+ [
368
+ nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
369
+ for _ in range(num_layers)
370
+ ]
371
+ )
372
+ self.context_proj = nn.ModuleList(
373
+ [nn.Linear(context_dim, out_channels) for _ in range(num_layers)]
374
+ )
375
+ self.residual_input_conv = nn.ModuleList(
376
+ [
377
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
378
+ for i in range(num_layers)
379
+ ]
380
+ )
381
+ self.down_sample_conv = (
382
+ nn.Conv2d(out_channels, out_channels, 4, 2, 1) if self.down_sample else nn.Identity()
383
+ )
384
+
385
+ def forward(self, x, t_emb=None, context=None):
386
+ out = x
387
+ for i in range(self.num_layers):
388
+ # Resnet block of Unet
389
+
390
+ resnet_input = out
391
+ out = self.resnet_conv_first[i](out)
392
+ if self.t_emb_dim is not None:
393
+ out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
394
+ out = self.resnet_conv_second[i](out)
395
+ out = out + self.residual_input_conv[i](resnet_input)
396
+
397
+ if self.attn:
398
+ # Attention block of Unet
399
+
400
+ batch_size, channels, h, w = out.shape
401
+ in_attn = out.reshape(batch_size, channels, h * w)
402
+ in_attn = self.attention_norms[i](in_attn)
403
+ in_attn = in_attn.transpose(1, 2)
404
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
405
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
406
+ out = out + out_attn
407
+ if self.cross_attn:
408
+ assert (
409
+ context is not None
410
+ ), "context cannot be None if cross attention layers are used"
411
+ batch_size, channels, h, w = out.shape
412
+ in_attn = out.reshape(batch_size, channels, h * w)
413
+ in_attn = self.cross_attention_norms[i](in_attn)
414
+ in_attn = in_attn.transpose(1, 2)
415
+ assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
416
+ context_proj = self.context_proj[i](context)
417
+ out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
418
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
419
+ out = out + out_attn
420
+ # Downsample
421
+
422
+ out = self.down_sample_conv(out)
423
+ return out
424
+
425
+
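A shape check for DownBlock as the VQVAE encoder uses it (no time embedding, self-attention enabled). The channel and size values are arbitrary, chosen only to satisfy the GroupNorm/attention divisibility constraints:

    _blk = DownBlock(64, 128, t_emb_dim=None, down_sample=True,
                     num_heads=4, num_layers=2, attn=True, norm_channels=32)
    _x = torch.randn(2, 64, 64, 64)
    print(_blk(_x).shape)   # torch.Size([2, 128, 32, 32])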
426
+
427
+ # ==================================================================
428
+ # M I D - B L O C K
429
+ # ==================================================================
430
+ class MidBlock(nn.Module):
431
+ r"""
432
+ Mid conv block with attention.
433
+ Sequence of following blocks
434
+ 1. Resnet block with time embedding
435
+ 2. Attention block
436
+ 3. Resnet block with time embedding
437
+ """
438
+
439
+ def __init__(
440
+ self,
441
+ in_channels,
442
+ out_channels,
443
+ t_emb_dim,
444
+ num_heads,
445
+ num_layers,
446
+ norm_channels,
447
+ cross_attn=None,
448
+ context_dim=None,
449
+ ):
450
+ super().__init__()
451
+ self.num_layers = num_layers
452
+ self.t_emb_dim = t_emb_dim
453
+ self.context_dim = context_dim
454
+ self.cross_attn = cross_attn
455
+ self.resnet_conv_first = nn.ModuleList(
456
+ [
457
+ nn.Sequential(
458
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
459
+ nn.SiLU(),
460
+ nn.Conv2d(
461
+ in_channels if i == 0 else out_channels,
462
+ out_channels,
463
+ kernel_size=3,
464
+ stride=1,
465
+ padding=1,
466
+ ),
467
+ )
468
+ for i in range(num_layers + 1)
469
+ ]
470
+ )
471
+
472
+ if self.t_emb_dim is not None:
473
+ self.t_emb_layers = nn.ModuleList(
474
+ [
475
+ nn.Sequential(nn.SiLU(), nn.Linear(t_emb_dim, out_channels))
476
+ for _ in range(num_layers + 1)
477
+ ]
478
+ )
479
+ self.resnet_conv_second = nn.ModuleList(
480
+ [
481
+ nn.Sequential(
482
+ nn.GroupNorm(norm_channels, out_channels),
483
+ nn.SiLU(),
484
+ nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
485
+ )
486
+ for _ in range(num_layers + 1)
487
+ ]
488
+ )
489
+
490
+ self.attention_norms = nn.ModuleList(
491
+ [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]
492
+ )
493
+
494
+ self.attentions = nn.ModuleList(
495
+ [
496
+ nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
497
+ for _ in range(num_layers)
498
+ ]
499
+ )
500
+ if self.cross_attn:
501
+ assert context_dim is not None, "Context Dimension must be passed for cross attention"
502
+ self.cross_attention_norms = nn.ModuleList(
503
+ [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]
504
+ )
505
+ self.cross_attentions = nn.ModuleList(
506
+ [
507
+ nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
508
+ for _ in range(num_layers)
509
+ ]
510
+ )
511
+ self.context_proj = nn.ModuleList(
512
+ [nn.Linear(context_dim, out_channels) for _ in range(num_layers)]
513
+ )
514
+ self.residual_input_conv = nn.ModuleList(
515
+ [
516
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
517
+ for i in range(num_layers + 1)
518
+ ]
519
+ )
520
+
521
+ def forward(self, x, t_emb=None, context=None):
522
+ out = x
523
+
524
+ # First resnet block
525
+
526
+ resnet_input = out
527
+ out = self.resnet_conv_first[0](out)
528
+ if self.t_emb_dim is not None:
529
+ out = out + self.t_emb_layers[0](t_emb)[:, :, None, None]
530
+ out = self.resnet_conv_second[0](out)
531
+ out = out + self.residual_input_conv[0](resnet_input)
532
+
533
+ for i in range(self.num_layers):
534
+ # Attention Block
535
+
536
+ batch_size, channels, h, w = out.shape
537
+ in_attn = out.reshape(batch_size, channels, h * w)
538
+ in_attn = self.attention_norms[i](in_attn)
539
+ in_attn = in_attn.transpose(1, 2)
540
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
541
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
542
+ out = out + out_attn
543
+
544
+ if self.cross_attn:
545
+ assert (
546
+ context is not None
547
+ ), "context cannot be None if cross attention layers are used"
548
+ batch_size, channels, h, w = out.shape
549
+ in_attn = out.reshape(batch_size, channels, h * w)
550
+ in_attn = self.cross_attention_norms[i](in_attn)
551
+ in_attn = in_attn.transpose(1, 2)
552
+ assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
553
+ context_proj = self.context_proj[i](context)
554
+ out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
555
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
556
+ out = out + out_attn
557
+ # Resnet Block
558
+
559
+ resnet_input = out
560
+ out = self.resnet_conv_first[i + 1](out)
561
+ if self.t_emb_dim is not None:
562
+ out = out + self.t_emb_layers[i + 1](t_emb)[:, :, None, None]
563
+ out = self.resnet_conv_second[i + 1](out)
564
+ out = out + self.residual_input_conv[i + 1](resnet_input)
565
+ return out
566
+
567
+
568
+ # ==================================================================
569
+ # U P - B L O C K
570
+ # ==================================================================
571
+ class UpBlock(nn.Module):
572
+ r"""
573
+ Up conv block with attention.
574
+ Sequence of following blocks
575
+ 1. Upsample
576
+ 2. Concatenate Down block output
578
+ 3. Resnet block with time embedding
579
+ 4. Attention Block
579
+ """
580
+
581
+ def __init__(
582
+ self,
583
+ in_channels,
584
+ out_channels,
585
+ t_emb_dim,
586
+ up_sample,
587
+ num_heads,
588
+ num_layers,
589
+ attn,
590
+ norm_channels,
591
+ ):
592
+ super().__init__()
593
+ self.num_layers = num_layers
594
+ self.up_sample = up_sample
595
+ self.t_emb_dim = t_emb_dim
596
+ self.attn = attn
597
+ self.resnet_conv_first = nn.ModuleList(
598
+ [
599
+ nn.Sequential(
600
+ nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
601
+ nn.SiLU(),
602
+ nn.Conv2d(
603
+ in_channels if i == 0 else out_channels,
604
+ out_channels,
605
+ kernel_size=3,
606
+ stride=1,
607
+ padding=1,
608
+ ),
609
+ )
610
+ for i in range(num_layers)
611
+ ]
612
+ )
613
+
614
+ if self.t_emb_dim is not None:
615
+ self.t_emb_layers = nn.ModuleList(
616
+ [
617
+ nn.Sequential(nn.SiLU(), nn.Linear(t_emb_dim, out_channels))
618
+ for _ in range(num_layers)
619
+ ]
620
+ )
621
+ self.resnet_conv_second = nn.ModuleList(
622
+ [
623
+ nn.Sequential(
624
+ nn.GroupNorm(norm_channels, out_channels),
625
+ nn.SiLU(),
626
+ nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
627
+ )
628
+ for _ in range(num_layers)
629
+ ]
630
+ )
631
+ if self.attn:
632
+ self.attention_norms = nn.ModuleList(
633
+ [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]
634
+ )
635
+
636
+ self.attentions = nn.ModuleList(
637
+ [
638
+ nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
639
+ for _ in range(num_layers)
640
+ ]
641
+ )
642
+ self.residual_input_conv = nn.ModuleList(
643
+ [
644
+ nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
645
+ for i in range(num_layers)
646
+ ]
647
+ )
648
+ self.up_sample_conv = (
649
+ nn.ConvTranspose2d(in_channels, in_channels, 4, 2, 1)
650
+ if self.up_sample
651
+ else nn.Identity()
652
+ )
653
+
654
+ def forward(self, x, out_down=None, t_emb=None):
655
+ # Upsample
656
+
657
+ x = self.up_sample_conv(x)
658
+
659
+ # Concat with Downblock output
660
+
661
+ if out_down is not None:
662
+ x = torch.cat([x, out_down], dim=1)
663
+ out = x
664
+ for i in range(self.num_layers):
665
+ # Resnet Block
666
+
667
+ resnet_input = out
668
+ out = self.resnet_conv_first[i](out)
669
+ if self.t_emb_dim is not None:
670
+ out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
671
+ out = self.resnet_conv_second[i](out)
672
+ out = out + self.residual_input_conv[i](resnet_input)
673
+
674
+ # Self Attention
675
+
676
+ if self.attn:
677
+ batch_size, channels, h, w = out.shape
678
+ in_attn = out.reshape(batch_size, channels, h * w)
679
+ in_attn = self.attention_norms[i](in_attn)
680
+ in_attn = in_attn.transpose(1, 2)
681
+ out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
682
+ out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
683
+ out = out + out_attn
684
+ return out
685
+
686
+
687
+ # ==================================================================
688
+ # V Q - V A E
689
+ # ==================================================================
690
+ class VQVAE(nn.Module):
691
+ def __init__(self, im_channels, model_config):
692
+ super().__init__()
693
+ self.down_channels = model_config.down_channels
694
+ self.mid_channels = model_config.mid_channels
695
+ self.down_sample = model_config.down_sample
696
+ self.num_down_layers = model_config.num_down_layers
697
+ self.num_mid_layers = model_config.num_mid_layers
698
+ self.num_up_layers = model_config.num_up_layers
699
+
700
+ # To disable attention in Downblock of Encoder and Upblock of Decoder
701
+ self.attns = model_config.attn_down
702
+
703
+ # Latent Dimension
704
+ self.z_channels = model_config.z_channels
705
+ self.codebook_size = model_config.codebook_size
706
+ self.norm_channels = model_config.norm_channels
707
+ self.num_heads = model_config.num_heads
708
+
709
+ # Assertion to validate the channel information
710
+ assert self.mid_channels[0] == self.down_channels[-1]
711
+ assert self.mid_channels[-1] == self.down_channels[-1]
712
+ assert len(self.down_sample) == len(self.down_channels) - 1
713
+ assert len(self.attns) == len(self.down_channels) - 1
714
+
715
+ # Wherever we use downsampling in encoder correspondingly use
716
+ # upsampling in decoder
717
+ self.up_sample = list(reversed(self.down_sample))
718
+
719
+ ##################### Encoder ######################
720
+ self.encoder_conv_in = nn.Conv2d(
721
+ im_channels, self.down_channels[0], kernel_size=3, padding=(1, 1)
722
+ )
723
+
724
+ # Downblock + Midblock
725
+ self.encoder_layers = nn.ModuleList([])
726
+ for i in range(len(self.down_channels) - 1):
727
+ self.encoder_layers.append(
728
+ DownBlock(
729
+ self.down_channels[i],
730
+ self.down_channels[i + 1],
731
+ t_emb_dim=None,
732
+ down_sample=self.down_sample[i],
733
+ num_heads=self.num_heads,
734
+ num_layers=self.num_down_layers,
735
+ attn=self.attns[i],
736
+ norm_channels=self.norm_channels,
737
+ )
738
+ )
739
+ self.encoder_mids = nn.ModuleList([])
740
+ for i in range(len(self.mid_channels) - 1):
741
+ self.encoder_mids.append(
742
+ MidBlock(
743
+ self.mid_channels[i],
744
+ self.mid_channels[i + 1],
745
+ t_emb_dim=None,
746
+ num_heads=self.num_heads,
747
+ num_layers=self.num_mid_layers,
748
+ norm_channels=self.norm_channels,
749
+ )
750
+ )
751
+ self.encoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[-1])
752
+ self.encoder_conv_out = nn.Conv2d(
753
+ self.down_channels[-1], self.z_channels, kernel_size=3, padding=1
754
+ )
755
+
756
+ # Pre Quantization Convolution
757
+ self.pre_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)
758
+
759
+ # Codebook
760
+ self.embedding = nn.Embedding(self.codebook_size, self.z_channels)
761
+ ####################################################
762
+
763
+ ##################### Decoder ######################
764
+
765
+ # Post Quantization Convolution
766
+ self.post_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)
767
+ self.decoder_conv_in = nn.Conv2d(
768
+ self.z_channels, self.mid_channels[-1], kernel_size=3, padding=(1, 1)
769
+ )
770
+
771
+ # Midblock + Upblock
772
+ self.decoder_mids = nn.ModuleList([])
773
+ for i in reversed(range(1, len(self.mid_channels))):
774
+ self.decoder_mids.append(
775
+ MidBlock(
776
+ self.mid_channels[i],
777
+ self.mid_channels[i - 1],
778
+ t_emb_dim=None,
779
+ num_heads=self.num_heads,
780
+ num_layers=self.num_mid_layers,
781
+ norm_channels=self.norm_channels,
782
+ )
783
+ )
784
+ self.decoder_layers = nn.ModuleList([])
785
+ for i in reversed(range(1, len(self.down_channels))):
786
+ self.decoder_layers.append(
787
+ UpBlock(
788
+ self.down_channels[i],
789
+ self.down_channels[i - 1],
790
+ t_emb_dim=None,
791
+ up_sample=self.down_sample[i - 1],
792
+ num_heads=self.num_heads,
793
+ num_layers=self.num_up_layers,
794
+ attn=self.attns[i - 1],
795
+ norm_channels=self.norm_channels,
796
+ )
797
+ )
798
+ self.decoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[0])
799
+ self.decoder_conv_out = nn.Conv2d(
800
+ self.down_channels[0], im_channels, kernel_size=3, padding=1
801
+ )
802
+
803
+ def quantize(self, x):
804
+ B, C, H, W = x.shape
805
+
806
+ # B, C, H, W -> B, H, W, C
807
+ x = x.permute(0, 2, 3, 1)
808
+
809
+ # B, H, W, C -> B, H*W, C
810
+ x = x.reshape(x.size(0), -1, x.size(-1))
811
+
812
+ # Find nearest embedding/codebook vector
813
+ # dist between (B, H*W, C) and (B, K, C) -> (B, H*W, K)
814
+ dist = torch.cdist(x, self.embedding.weight[None, :].repeat((x.size(0), 1, 1)))
815
+ # (B, H*W)
816
+ min_encoding_indices = torch.argmin(dist, dim=-1)
817
+
818
+ # Replace encoder output with nearest codebook
819
+ # quant_out -> B*H*W, C
820
+ quant_out = torch.index_select(self.embedding.weight, 0, min_encoding_indices.view(-1))
821
+
822
+ # x -> B*H*W, C
823
+ x = x.reshape((-1, x.size(-1)))
824
+ commitment_loss = torch.mean((quant_out.detach() - x) ** 2)
825
+ codebook_loss = torch.mean((quant_out - x.detach()) ** 2)
826
+ quantize_losses = {"codebook_loss": codebook_loss, "commitment_loss": commitment_loss}
827
+ # Straight through estimation
828
+ quant_out = x + (quant_out - x).detach()
829
+
830
+ # quant_out -> B, C, H, W
831
+ quant_out = quant_out.reshape((B, H, W, C)).permute(0, 3, 1, 2)
832
+ min_encoding_indices = min_encoding_indices.reshape(
833
+ (-1, quant_out.size(-2), quant_out.size(-1))
834
+ )
835
+ return quant_out, quantize_losses, min_encoding_indices
836
+
837
+ def encode(self, x):
838
+ out = self.encoder_conv_in(x)
839
+ for idx, down in enumerate(self.encoder_layers):
840
+ out = down(out)
841
+ for mid in self.encoder_mids:
842
+ out = mid(out)
843
+ out = self.encoder_norm_out(out)
844
+ out = nn.SiLU()(out)
845
+ out = self.encoder_conv_out(out)
846
+ out = self.pre_quant_conv(out)
847
+ out, quant_losses, _ = self.quantize(out)
848
+ return out, quant_losses
849
+
850
+ def decode(self, z):
851
+ out = z
852
+ out = self.post_quant_conv(out)
853
+ out = self.decoder_conv_in(out)
854
+ for mid in self.decoder_mids:
855
+ out = mid(out)
856
+ for idx, up in enumerate(self.decoder_layers):
857
+ out = up(out)
858
+ out = self.decoder_norm_out(out)
859
+ out = nn.SiLU()(out)
860
+ out = self.decoder_conv_out(out)
861
+ return out
862
+
863
+ def forward(self, x):
864
+ '''out: [B, 3, 256, 256]
865
+ z: [B, 3, 64, 64]
866
+ quant_losses: {
867
+ codebook_loss: 0.0681,
868
+ commitment_loss: 0.0681
869
+ }
870
+ '''
871
+ z, quant_losses = self.encode(x)
872
+ out = self.decode(z)
873
+ return out, z, quant_losses
874
+
875
+
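The heart of quantize() above is a nearest-neighbour codebook lookup followed by the straight-through estimator; the same steps on toy tensors (shapes chosen arbitrarily):

    _codebook = nn.Embedding(8, 4)                    # K = 8 codes, C = 4 channels
    _z_e = torch.randn(2, 6, 4, requires_grad=True)   # encoder output as (B, H*W, C)
    _dist = torch.cdist(_z_e, _codebook.weight[None].repeat(2, 1, 1))   # (B, H*W, K)
    _idx = _dist.argmin(dim=-1)                       # nearest code per position
    _z_q = _codebook(_idx)                            # quantized vectors
    _z_q = _z_e + (_z_q - _z_e).detach()              # straight-through: grads reach the encoder
    print(_idx.shape, _z_q.shape)                     # torch.Size([2, 6]) torch.Size([2, 6, 4])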
876
+ # ==================================================================
877
+ # C O N F I G U R A T I O N
878
+ # ==================================================================
879
+ import pprint
880
+ config_path = "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/LDM/scripts/config.yaml"
881
+ with open(config_path, 'r') as file:
882
+ Config = yaml.safe_load(file)
883
+ pprint.pprint(Config, width=120)
884
+
885
+ Config = DotDict.from_dict(Config)
886
+ dataset_config = Config.dataset_params
887
+ diffusion_config = Config.diffusion_params
888
+ model_config = Config.model_params
889
+ train_config = Config.train_params
890
+ paths = Config.paths
891
+
892
+
893
+ # ==================================================================
894
+ # V A A N I - D A T A S E T
895
+ # ==================================================================
896
+ IMAGES_PATH = paths.images_dir
897
+
898
+ def walkDIR(folder_path, include=None):
899
+ file_list = []
900
+ for root, _, files in os.walk(folder_path):
901
+ for file in files:
902
+ if include is None or any(file.endswith(ext) for ext in include):
903
+ file_list.append(os.path.join(root, file))
904
+ print("Files found:", len(file_list))
905
+ return file_list
906
+
907
+ files = walkDIR(IMAGES_PATH, include=['.png', '.jpeg', '.jpg'])
908
+ df = pd.DataFrame(files, columns=['image_path'])
909
+
910
+ class VaaniDataset(torch.utils.data.Dataset):
911
+ def __init__(self, files_paths, im_size):
912
+ self.files_paths = files_paths
913
+ self.im_size = im_size
914
+
915
+ def __len__(self):
916
+ return len(self.files_paths)
917
+
918
+ def __getitem__(self, idx):
919
+ image = tv.io.decode_image(self.files_paths[idx], mode='RGB')
920
+ image = v2.Resize((self.im_size,self.im_size))(image)
921
+ image = v2.ToDtype(torch.float32, scale=True)(image)
922
+ # image = 2*image - 1
923
+ return image
924
+
925
+ dataset = VaaniDataset(files_paths=files, im_size=dataset_config.im_size)
926
+ image = dataset[2]
927
+ print('IMAGE SHAPE:', image.shape)
928
+
929
+ dataloader = torch.utils.data.DataLoader(
930
+ dataset,
931
+ batch_size=train_config.autoencoder_batch_size,
932
+ shuffle=True,
933
+ num_workers=os.cpu_count(),
934
+ pin_memory=False,
935
+ drop_last=True,
936
+ persistent_workers=True
937
+ )
938
+
939
+ images = next(iter(dataloader))
940
+ print('BATCH SHAPE:', images.shape)
941
+
942
+
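Note that VaaniDataset returns images in [0, 1] (the 2*image - 1 line is commented out), whereas the LPIPS module above assumes [-1, 1] unless normalize=True is passed; if signed inputs are intended, the rescaling is simply:

    _to_signed = lambda x: 2 * x - 1      # [0, 1] -> [-1, 1]
    _to_unit   = lambda x: (x + 1) / 2    # [-1, 1] -> [0, 1]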
943
+ # ==================================================================
944
+ # M O D E L - I N I T I A L I Z A T I O N
945
+ # ==================================================================
946
+ dataset_config = Config.dataset_params
947
+ autoencoder_config = Config.autoencoder_params
948
+ train_config = Config.train_params
949
+
950
+ model = VQVAE(im_channels=dataset_config.im_channels,
951
+ model_config=autoencoder_config).to(device)
952
+
953
+ # model_output = model(images)
954
+ # print('MODEL OUTPUT:')
955
+ # print(model_output[0].shape, model_output[1].shape, model_output[2])
956
+
957
+
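The commented-out check above can be run as follows; a minimal sketch using the model and the images batch already built in this script:

# Sanity check mirroring the commented-out lines above.
with torch.no_grad():
    recon, z, quantize_losses = model(images.to(device))
print('RECON :', recon.shape)   # expected [B, 3, 128, 128] for im_size 128
print('LATENT:', z.shape)       # expected [B, 3, 32, 32]
print('LOSSES:', {k: round(float(v), 4) for k, v in quantize_losses.items()})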
958
+
959
+ # ==================================================================
960
+ # V Q - V A E - T R A I N I N G
961
+ # ==================================================================
962
+ # python your_script.py > training.log 2>&1   (redirect stdout and stderr to the log)
963
+ import time
964
+
965
+ def format_time(t1, t2):
966
+ elapsed_time = t2 - t1
967
+ if elapsed_time < 60:
968
+ return f"{elapsed_time:.2f} seconds"
969
+ elif elapsed_time < 3600:
970
+ minutes = elapsed_time // 60
971
+ seconds = elapsed_time % 60
972
+ return f"{minutes:.0f} minutes {seconds:.2f} seconds"
973
+ elif elapsed_time < 86400:
974
+ hours = elapsed_time // 3600
975
+ remainder = elapsed_time % 3600
976
+ minutes = remainder // 60
977
+ seconds = remainder % 60
978
+ return f"{hours:.0f} hours {minutes:.0f} minutes {seconds:.2f} seconds"
979
+ else:
980
+ days = elapsed_time // 86400
981
+ remainder = elapsed_time % 86400
982
+ hours = remainder // 3600
983
+ remainder = remainder % 3600
984
+ minutes = remainder // 60
985
+ seconds = remainder % 60
986
+ return f"{days:.0f} days {hours:.0f} hours {minutes:.0f} minutes {seconds:.2f} seconds"
987
+
988
+ def save_checkpoint(
989
+ total_steps, epoch, model, discriminator, optimizer_d,
990
+ optimizer_g, metrics, checkpoint_path, logs, total_training_time
991
+ ):
992
+ checkpoint = {
993
+ "total_steps": total_steps,
994
+ "epoch": epoch,
995
+ "model_state_dict": model.state_dict(),
996
+ "discriminator_state_dict": discriminator.state_dict(),
997
+ "optimizer_d_state_dict": optimizer_d.state_dict(),
998
+ "optimizer_g_state_dict": optimizer_g.state_dict(),
999
+ "metrics": metrics,
1000
+ "logs": logs,
1001
+ "total_training_time": total_training_time
1002
+ }
1003
+ torch.save(checkpoint, checkpoint_path)
1004
+ print(f"Checkpoint saved after {total_steps} steps at epoch {epoch}")
1005
+
1006
+ def load_checkpoint(checkpoint_path, model, discriminator, optimizer_d, optimizer_g):
1007
+ if os.path.exists(checkpoint_path):
1008
+ checkpoint = torch.load(checkpoint_path, map_location=device)
1009
+ model.load_state_dict(checkpoint["model_state_dict"])
1010
+ discriminator.load_state_dict(checkpoint["discriminator_state_dict"])
1011
+ optimizer_d.load_state_dict(checkpoint["optimizer_d_state_dict"])
1012
+ optimizer_g.load_state_dict(checkpoint["optimizer_g_state_dict"])
1013
+ total_steps = checkpoint["total_steps"]
1014
+ epoch = checkpoint["epoch"]
1015
+ metrics = checkpoint["metrics"]
1016
+ logs = checkpoint.get("logs", [])
1017
+ total_training_time = checkpoint.get("total_training_time", 0)
1018
+ print(f"Checkpoint loaded. Resuming from epoch {epoch + 1}, step {total_steps}")
1019
+ return total_steps, epoch + 1, metrics, logs, total_training_time
1020
+ else:
1021
+ print("No checkpoint found. Starting from scratch.")
1022
+ return 0, 0, None, [], 0
1023
+
1024
+ def inference(model, dataset, save_path, epoch, device="cuda", sample_size=8):
1025
+ if not os.path.exists(save_path):
1026
+ os.makedirs(save_path)
1027
+
1028
+ image_tensors = []
1029
+ for i in range(sample_size):
1030
+ image_tensors.append(dataset[i].unsqueeze(0))
1031
+
1032
+ image_tensors = torch.cat(image_tensors, dim=0).to(device)
1033
+ with torch.no_grad():
1034
+ outputs, _, _ = model(image_tensors)
1035
+
1036
+ save_input = image_tensors.detach().cpu()
1037
+ save_output = outputs
1038
+
1039
+ grid = make_grid(torch.cat([save_input, save_output], dim=0), nrow=sample_size)
1040
+
1041
+ combined_image = tv.transforms.ToPILImage()(grid)
1042
+ combined_image.save(os.path.join(save_path, f"reconstructed_images_EP-{epoch}_{sample_size}.png"))
1043
+
1044
+ print(f"Reconstructed images saved at: {save_path}")
1045
+
1046
+
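inference can also be used on its own to regenerate reconstructions from a saved checkpoint; a minimal sketch, with paths and key names following save_checkpoint above and trainVAE below:

# Hypothetical standalone use: reload the latest VQ-VAE weights and dump a
# reconstruction grid without re-running the training loop.
ckpt = torch.load(os.path.join(train_config.task_name, 'vqvaq_ckpt.pth'), map_location=device)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()
inference(model, dataset,
          save_path=os.path.join(train_config.task_name, 'vqvae_recon'),
          epoch=ckpt['epoch'], device=device, sample_size=16)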
1047
+ def trainVAE(Config, dataloader):
1048
+ dataset_config = Config.dataset_params
1049
+ autoencoder_config = Config.autoencoder_params
1050
+ train_config = Config.train_params
1051
+ paths = Config.paths
1052
+
1053
+ seed = train_config.seed
1054
+ torch.manual_seed(seed)
1055
+ np.random.seed(seed)
1056
+ random.seed(seed)
1057
+ if device == "cuda":
1058
+ torch.cuda.manual_seed_all(seed)
1059
+
1060
+ model = VQVAE(im_channels=dataset_config.im_channels, model_config=autoencoder_config).to(device)
1061
+ discriminator = Discriminator(im_channels=dataset_config.im_channels).to(device)
1062
+
1063
+ optimizer_d = torch.optim.AdamW(discriminator.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))
1064
+ optimizer_g = torch.optim.AdamW(model.parameters(), lr=train_config.autoencoder_lr, betas=(0.5, 0.999))
1065
+
1066
+ checkpoint_path = os.path.join(train_config.task_name, "vqvaq_ckpt.pth")
1067
+ total_steps, start_epoch, metrics, logs, total_training_time = load_checkpoint(checkpoint_path, model, discriminator, optimizer_d, optimizer_g)
1068
+
1069
+ if not os.path.exists(train_config.task_name):
1070
+ os.mkdir(train_config.task_name)
1071
+
1072
+ num_epochs = train_config.autoencoder_epochs
1073
+ recon_criterion = torch.nn.MSELoss()
1074
+ disc_criterion = torch.nn.MSELoss()
1075
+ lpips_model = LPIPS().eval().to(device)
1076
+
1077
+ acc_steps = train_config.autoencoder_acc_steps
1078
+ disc_step_start = train_config.disc_start
1079
+
1080
+ start_time_total = time.time() - total_training_time
1081
+
1082
+ for epoch_idx in trange(start_epoch, num_epochs):
1083
+ start_time_epoch = time.time()
1084
+ epoch_log = []
1085
+
1086
+ for images in tqdm(dataloader):
1087
+ batch_start_time = time.time()
1088
+ total_steps += 1
1089
+
1090
+ images = images.to(device)
1091
+ model_output = model(images)
1092
+ output, z, quantize_losses = model_output
1093
+
1094
+ recon_loss = recon_criterion(output, images) / acc_steps
1095
+
1096
+ g_loss = (
1097
+ recon_loss
1098
+ + (train_config.codebook_weight * quantize_losses["codebook_loss"] / acc_steps)
1099
+ + (train_config.commitment_beta * quantize_losses["commitment_loss"] / acc_steps)
1100
+ )
1101
+
1102
+ if total_steps > disc_step_start:
1103
+ disc_fake_pred = discriminator(output)
1104
+ disc_fake_loss = disc_criterion(disc_fake_pred, torch.ones_like(disc_fake_pred))
1105
+ g_loss += train_config.disc_weight * disc_fake_loss / acc_steps
1106
+
1107
+ lpips_loss = torch.mean(lpips_model(output, images)) / acc_steps
1108
+ g_loss += train_config.perceptual_weight * lpips_loss
1109
+
1110
+ g_loss.backward()
1111
+
1112
+ if total_steps % acc_steps == 0:
1113
+ optimizer_g.step()
1114
+ optimizer_g.zero_grad()
1115
+
1116
+ if total_steps > disc_step_start:
1117
+ disc_fake_pred = discriminator(output.detach())
1118
+ disc_real_pred = discriminator(images)
1119
+ disc_loss = (disc_criterion(disc_fake_pred, torch.zeros_like(disc_fake_pred)) +
1120
+ disc_criterion(disc_real_pred, torch.ones_like(disc_real_pred))) / 2 / acc_steps
1121
+ disc_loss.backward()
1122
+
1123
+ if total_steps % acc_steps == 0:
1124
+ optimizer_d.step()
1125
+ optimizer_d.zero_grad()
1126
+
1127
+ batch_time = time.time() - batch_start_time
1128
+ epoch_log.append(format_time(0, batch_time))
1129
+
1130
+ epoch_time = time.time() - start_time_epoch
1131
+ logs.append({"epoch": epoch_idx + 1, "epoch_time": format_time(0, epoch_time), "batch_times": epoch_log})
1132
+
1133
+ total_training_time = time.time() - start_time_total
1134
+
1135
+ save_checkpoint(total_steps, epoch_idx + 1, model, discriminator, optimizer_d, optimizer_g, metrics, checkpoint_path, logs, total_training_time)
1136
+ recon_save_path = os.path.join(train_config.task_name, 'vqvae_recon')
1137
+ inference(model, dataset, recon_save_path, epoch=epoch_idx, device=device, sample_size=16)
1138
+
1139
+ print("Training completed.")
1140
+
1141
+
1142
+
1143
+
1144
+ # ==================================================================
1145
+ # S T A R T I N G - T R A I N I N G
1146
+ # ==================================================================
1147
+
1148
+ trainVAE(Config, dataloader)
1149
+
1150
+ # python Vaani-VQVAE-Main.py | tee AE-training.log
1151
+ # python Vaani-VQVAE-Main.py > AE-training.log 2>&1
Vaani/LDM/scripts/VaaniLDM/vqvaq_ckpt-15.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3204e13addde475d8203e0865947f1742ffeef2ecb828cf298a704c660a5964b
3
+ size 88345234
Vaani/LDM/scripts/VaaniLDM/vqvaq_ckpt.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c8b43abfb2f4362a48ffd111535aaf45ef239a08496838b79f8855f95d291bc
3
+ size 93659794
Vaani/LDM/scripts/_1_Lpips.py ADDED
@@ -0,0 +1,56 @@
1
+ # ==================================================================
2
+ # LEARNED PERCEPTUAL IMAGE PATCH SIMILARITY ( L P I P S )
3
+ # ==================================================================
4
+ # Author : Ashish Kumar Uchadiya
5
+ # Created : January 18, 2025
6
+ # Description: LPIPS essentially computes the similarity between the
7
+ # activations of two image patches for some pre-defined network.
8
+ # This measure has been shown to match human perception well.
9
+ # A low LPIPS score means that the image patches are perceptually similar.
10
+ # ==================================================================
11
+
12
+
13
+
14
+ import torch
+ import torchvision
+ from collections import namedtuple
+
+ class vgg16(torch.nn.Module):
15
+ def __init__(self, requires_grad=False, pretrained=True):
16
+ super(vgg16, self).__init__()
17
+ vgg_pretrained_features = torchvision.models.vgg16(
18
+ weights=torchvision.models.VGG16_Weights.IMAGENET1K_V1
19
+ ).features
20
+ self.slice1 = torch.nn.Sequential()
21
+ self.slice2 = torch.nn.Sequential()
22
+ self.slice3 = torch.nn.Sequential()
23
+ self.slice4 = torch.nn.Sequential()
24
+ self.slice5 = torch.nn.Sequential()
25
+ self.N_slices = 5
26
+ for x in range(4):
27
+ self.slice1.add_module(str(x), vgg_pretrained_features[x])
28
+ for x in range(4, 9):
29
+ self.slice2.add_module(str(x), vgg_pretrained_features[x])
30
+ for x in range(9, 16):
31
+ self.slice3.add_module(str(x), vgg_pretrained_features[x])
32
+ for x in range(16, 23):
33
+ self.slice4.add_module(str(x), vgg_pretrained_features[x])
34
+ for x in range(23, 30):
35
+ self.slice5.add_module(str(x), vgg_pretrained_features[x])
36
+
37
+ # Freeze vgg model
38
+ if not requires_grad:
39
+ for param in self.parameters():
40
+ param.requires_grad = False
41
+
42
+ def forward(self, X):
43
+ # Return output of vgg features
44
+ h = self.slice1(X)
45
+ h_relu1_2 = h
46
+ h = self.slice2(h)
47
+ h_relu2_2 = h
48
+ h = self.slice3(h)
49
+ h_relu3_3 = h
50
+ h = self.slice4(h)
51
+ h_relu4_3 = h
52
+ h = self.slice5(h)
53
+ h_relu5_3 = h
54
+ vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
55
+ out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
56
+ return out
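
A minimal usage sketch for the feature extractor above; the random input stands in for a normalised RGB image, since LPIPS normally applies ImageNet normalisation before this call:

import torch

# Extract the five relu feature maps that LPIPS compares between two images.
net = vgg16(requires_grad=False, pretrained=True).eval()
x = torch.randn(1, 3, 128, 128)
with torch.no_grad():
    feats = net(x)
print([tuple(f.shape) for f in feats])  # relu1_2 ... relu5_3, progressively downsampled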
Vaani/LDM/scripts/__init__.py ADDED
File without changes
Vaani/LDM/scripts/config.yaml ADDED
@@ -0,0 +1,65 @@
1
+ dataset_params:
2
+ im_channels: 3
3
+ im_size: 128
4
+
5
+ paths:
6
+ images_dir: "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images"
7
+ vqvae_recon:
8
+
9
+ diffusion_params:
10
+ num_timesteps: 1000
11
+ beta_start: 0.0015
12
+ beta_end: 0.0195
13
+
14
+ ldm_params:
15
+ down_channels: [ 128, 256, 256, 256 ]
16
+ mid_channels: [ 256, 256 ]
17
+ down_sample: [ False, False, False ]
18
+ attn_down: [ True, True, True ]
19
+ time_emb_dim: 256
20
+ norm_channels: 32
21
+ num_heads: 16
22
+ conv_out_channels: 128
23
+ num_down_layers: 2
24
+ num_mid_layers: 2
25
+ num_up_layers: 2
26
+
27
+ autoencoder_params:
28
+ z_channels: 3
29
+ codebook_size: 20
30
+ down_channels: [ 32, 64, 128 ]
31
+ mid_channels: [ 128, 128 ]
32
+ down_sample: [ True, True ]
33
+ attn_down: [ False, False ]
34
+ norm_channels: 32
35
+ num_heads: 16
36
+ num_down_layers: 4
37
+ num_mid_layers: 4
38
+ num_up_layers: 4
39
+
40
+ train_params:
41
+ seed: 4422
42
+ task_name: 'VaaniLDM'
43
+ ldm_batch_size: 1
44
+ autoencoder_batch_size: 4
45
+ disc_start: 1000
46
+ disc_weight: 0.5
47
+ codebook_weight: 1
48
+ commitment_beta: 0.2
49
+ perceptual_weight: 1
50
+ kl_weight: 0.000005
51
+ ldm_epochs: 10
52
+ autoencoder_epochs: 10
53
+ num_samples: 9
54
+ num_grid_rows: 3
55
+ ldm_lr: 0.00001
56
+ autoencoder_lr: 0.0001
57
+ autoencoder_acc_steps: 1
58
+ autoencoder_img_save_steps: 8
59
+ save_latents: True
60
+ vqvae_latent_dir_name: 'vqvae_latents'
61
+ ldm_ckpt_name: 'ddpm_ckpt.pth'
62
+ vqvae_ckpt_name: 'vqvaq_ckpt.pth'
63
+
64
+ training:
65
+ _continue_: True
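
The autoencoder_params above fix the latent resolution: every True entry in down_sample halves the spatial size. A small sketch of the arithmetic, with values copied from this file:

im_size = 128                 # dataset_params.im_size
down_sample = [True, True]    # autoencoder_params.down_sample
z_channels = 3                # autoencoder_params.z_channels

latent_size = im_size // (2 ** sum(down_sample))
print(latent_size)            # 32 -> latents are [z_channels, 32, 32]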
Vaani/LDM/scripts/dotdict.py ADDED
@@ -0,0 +1,53 @@
1
+ from typing import Any
2
+ from argparse import Namespace
3
+ import typing
4
+
5
+
6
+ class DotDict(Namespace):
7
+ """A simple class that builds upon `argparse.Namespace`
8
+ in order to make chained attributes possible."""
9
+
10
+ def __init__(self, temp=False, key=None, parent=None) -> None:
11
+ self._temp = temp
12
+ self._key = key
13
+ self._parent = parent
14
+
15
+ def __eq__(self, other):
16
+ if not isinstance(other, DotDict):
17
+ return NotImplemented
18
+ return vars(self) == vars(other)
19
+
20
+ def __getattr__(self, __name: str) -> Any:
21
+ if __name not in self.__dict__ and not self._temp:
22
+ self.__dict__[__name] = DotDict(temp=True, key=__name, parent=self)
23
+ else:
24
+ del self._parent.__dict__[self._key]
25
+ raise AttributeError("No attribute '%s'" % __name)
26
+ return self.__dict__[__name]
27
+
28
+ def __repr__(self) -> str:
29
+ item_keys = [k for k in self.__dict__ if not k.startswith("_")]
30
+
31
+ if len(item_keys) == 0:
32
+ return "DotDict()"
33
+ elif len(item_keys) == 1:
34
+ key = item_keys[0]
35
+ val = self.__dict__[key]
36
+ return "DotDict(%s=%s)" % (key, repr(val))
37
+ else:
38
+ return "DotDict(%s)" % ", ".join(
39
+ "%s=%s" % (key, repr(val)) for key, val in self.__dict__.items()
40
+ )
41
+
42
+ @classmethod
43
+ def from_dict(cls, original: typing.Mapping[str, Any]) -> "DotDict":
44
+ """Create a DotDict from a (possibly nested) dict `original`.
45
+ Warning: this method should not be used on very deeply nested inputs,
46
+ since it's recursively traversing the nested dictionary values.
47
+ """
48
+ dd = DotDict()
49
+ for key, value in original.items():
50
+ if isinstance(value, typing.Mapping):
51
+ value = cls.from_dict(value)
52
+ setattr(dd, key, value)
53
+ return dd
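
A short usage example for DotDict.from_dict, mirroring how config.yaml is consumed in Main.py; the sample values are illustrative:

cfg = DotDict.from_dict({'train_params': {'autoencoder_lr': 0.0001, 'seed': 4422}})
print(cfg.train_params.autoencoder_lr)   # 0.0001
print(cfg.train_params.seed)             # 4422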
Vaani/SLURM_test.sh ADDED
@@ -0,0 +1,20 @@
1
+ #!/bin/bash -x
2
+ #SBATCH -N 1
3
+ #SBATCH --ntasks-per-node=48
4
+ #SBATCH --mem 128G
5
+ #SBATCH -t 01:00:00
6
+ #SBATCH -J ASHISH_test_cpu
7
+ #SBATCH -o %j.out # name of stdout output file(--output)
8
+ #SBATCH -e %j.err # name of stderr error file(--error)
9
+ cd $SLURM_WORKDIR
10
+
11
+ module purge
12
+ module load miniconda # load the module and environment
13
+ source /home/apps/miniconda3/etc/profile.d/conda.sh
14
+ conda env list
15
+ conda activate aku_env # load working environment
16
+
17
+ python /home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/image_data_metadata.py # run python script
18
+
19
+ conda deactivate # deactivate environment
20
+ # end of script
Vaani/VQVAE_architecture.svg ADDED
Vaani/VQVAE_summary.txt ADDED
@@ -0,0 +1,438 @@
1
+ TIME: 2025-05-09 21:58:45.534412
2
+ DEVICE: cuda
3
+ {'autoencoder_params': {'attn_down': [False, False],
4
+ 'codebook_size': 20,
5
+ 'down_channels': [32, 64, 128],
6
+ 'down_sample': [True, True],
7
+ 'mid_channels': [128, 128],
8
+ 'norm_channels': 32,
9
+ 'num_down_layers': 4,
10
+ 'num_heads': 16,
11
+ 'num_mid_layers': 4,
12
+ 'num_up_layers': 4,
13
+ 'z_channels': 3},
14
+ 'dataset_params': {'im_channels': 3, 'im_size': 128},
15
+ 'diffusion_params': {'beta_end': 0.0195, 'beta_start': 0.0015, 'num_timesteps': 1000},
16
+ 'ldm_params': {'attn_down': [True, True, True],
17
+ 'conv_out_channels': 128,
18
+ 'down_channels': [128, 256, 256, 256],
19
+ 'down_sample': [False, False, False],
20
+ 'mid_channels': [256, 256],
21
+ 'norm_channels': 32,
22
+ 'num_down_layers': 2,
23
+ 'num_heads': 16,
24
+ 'num_mid_layers': 2,
25
+ 'num_up_layers': 2,
26
+ 'time_emb_dim': 256},
27
+ 'paths': {'images_dir': '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images'},
28
+ 'train_params': {'autoencoder_acc_steps': 1,
29
+ 'autoencoder_batch_size': 8,
30
+ 'autoencoder_epochs': 30,
31
+ 'autoencoder_img_save_steps': 8,
32
+ 'autoencoder_lr': 0.0001,
33
+ 'codebook_weight': 1,
34
+ 'commitment_beta': 0.2,
35
+ 'disc_start': 1000,
36
+ 'disc_weight': 0.5,
37
+ 'kl_weight': 5e-06,
38
+ 'ldm_batch_size': 1,
39
+ 'ldm_ckpt_name': 'ddpm_ckpt.pth',
40
+ 'ldm_epochs': 10,
41
+ 'ldm_lr': 1e-05,
42
+ 'num_grid_rows': 3,
43
+ 'num_samples': 9,
44
+ 'perceptual_weight': 1,
45
+ 'save_latents': True,
46
+ 'seed': 4422,
47
+ 'task_name': 'VaaniLDM',
48
+ 'vqvae_ckpt_name': 'vqvaq_ckpt.pth',
49
+ 'vqvae_latent_dir_name': 'vqvae_latents'},
50
+ 'training': {'_continue_': True}}
51
+
52
+
53
+ Files found: 128807
54
+ IMAGE SHAPE: torch.Size([3, 128, 128])
55
+ BATCH SHAPE: torch.Size([8, 3, 128, 128])
56
+
57
+
58
+ ======================================================================================================================================================
59
+ Layer (type (var_name)) Input Shape Output Shape Param # Trainable Param %
60
+ ======================================================================================================================================================
61
+ VQVAE (VQVAE) [8, 3, 128, 128] [8, 3, 128, 128] 60 True 0.00%
62
+ ├─Conv2d (encoder_conv_in) [8, 3, 128, 128] [8, 32, 128, 128] 896 True 0.01%
63
+ ├─ModuleList (encoder_layers) -- -- -- True --
64
+ │ └─DownBlock (0) [8, 32, 128, 128] [8, 64, 64, 64] -- True --
65
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
66
+ │ │ │ └─Sequential (0) [8, 32, 128, 128] [8, 64, 128, 128] -- True --
67
+ │ │ │ │ └─GroupNorm (0) [8, 32, 128, 128] [8, 32, 128, 128] 64 True 0.00%
68
+ │ │ │ │ └─SiLU (1) [8, 32, 128, 128] [8, 32, 128, 128] -- -- --
69
+ │ │ │ │ └─Conv2d (2) [8, 32, 128, 128] [8, 64, 128, 128] 18,496 True 0.30%
70
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
71
+ │ │ │ └─Sequential (0) [8, 64, 128, 128] [8, 64, 128, 128] -- True --
72
+ │ │ │ │ └─GroupNorm (0) [8, 64, 128, 128] [8, 64, 128, 128] 128 True 0.00%
73
+ │ │ │ │ └─SiLU (1) [8, 64, 128, 128] [8, 64, 128, 128] -- -- --
74
+ │ │ │ │ └─Conv2d (2) [8, 64, 128, 128] [8, 64, 128, 128] 36,928 True 0.59%
75
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
76
+ │ │ │ └─Conv2d (0) [8, 32, 128, 128] [8, 64, 128, 128] 2,112 True 0.03%
77
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
78
+ │ │ │ └─Sequential (1) [8, 64, 128, 128] [8, 64, 128, 128] -- True --
79
+ │ │ │ │ └─GroupNorm (0) [8, 64, 128, 128] [8, 64, 128, 128] 128 True 0.00%
80
+ │ │ │ │ └─SiLU (1) [8, 64, 128, 128] [8, 64, 128, 128] -- -- --
81
+ │ │ │ │ └─Conv2d (2) [8, 64, 128, 128] [8, 64, 128, 128] 36,928 True 0.59%
82
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
83
+ │ │ │ └─Sequential (1) [8, 64, 128, 128] [8, 64, 128, 128] -- True --
84
+ │ │ │ │ └─GroupNorm (0) [8, 64, 128, 128] [8, 64, 128, 128] 128 True 0.00%
85
+ │ │ │ │ └─SiLU (1) [8, 64, 128, 128] [8, 64, 128, 128] -- -- --
86
+ │ │ │ │ └─Conv2d (2) [8, 64, 128, 128] [8, 64, 128, 128] 36,928 True 0.59%
87
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
88
+ │ │ │ └─Conv2d (1) [8, 64, 128, 128] [8, 64, 128, 128] 4,160 True 0.07%
89
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
90
+ │ │ │ └─Sequential (2) [8, 64, 128, 128] [8, 64, 128, 128] -- True --
91
+ │ │ │ │ └─GroupNorm (0) [8, 64, 128, 128] [8, 64, 128, 128] 128 True 0.00%
92
+ │ │ │ │ └─SiLU (1) [8, 64, 128, 128] [8, 64, 128, 128] -- -- --
93
+ │ │ │ │ └─Conv2d (2) [8, 64, 128, 128] [8, 64, 128, 128] 36,928 True 0.59%
94
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
95
+ │ │ │ └─Sequential (2) [8, 64, 128, 128] [8, 64, 128, 128] -- True --
96
+ │ │ │ │ └─GroupNorm (0) [8, 64, 128, 128] [8, 64, 128, 128] 128 True 0.00%
97
+ │ │ │ │ └─SiLU (1) [8, 64, 128, 128] [8, 64, 128, 128] -- -- --
98
+ │ │ │ │ └─Conv2d (2) [8, 64, 128, 128] [8, 64, 128, 128] 36,928 True 0.59%
99
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
100
+ │ │ │ └─Conv2d (2) [8, 64, 128, 128] [8, 64, 128, 128] 4,160 True 0.07%
101
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
102
+ │ │ │ └─Sequential (3) [8, 64, 128, 128] [8, 64, 128, 128] -- True --
103
+ │ │ │ │ └─GroupNorm (0) [8, 64, 128, 128] [8, 64, 128, 128] 128 True 0.00%
104
+ │ │ │ │ └─SiLU (1) [8, 64, 128, 128] [8, 64, 128, 128] -- -- --
105
+ │ │ │ │ └─Conv2d (2) [8, 64, 128, 128] [8, 64, 128, 128] 36,928 True 0.59%
106
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
107
+ │ │ │ └─Sequential (3) [8, 64, 128, 128] [8, 64, 128, 128] -- True --
108
+ │ │ │ │ └─GroupNorm (0) [8, 64, 128, 128] [8, 64, 128, 128] 128 True 0.00%
109
+ │ │ │ │ └─SiLU (1) [8, 64, 128, 128] [8, 64, 128, 128] -- -- --
110
+ │ │ │ │ └─Conv2d (2) [8, 64, 128, 128] [8, 64, 128, 128] 36,928 True 0.59%
111
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
112
+ │ │ │ └─Conv2d (3) [8, 64, 128, 128] [8, 64, 128, 128] 4,160 True 0.07%
113
+ │ │ └─Conv2d (down_sample_conv) [8, 64, 128, 128] [8, 64, 64, 64] 65,600 True 1.05%
114
+ │ └─DownBlock (1) [8, 64, 64, 64] [8, 128, 32, 32] -- True --
115
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
116
+ │ │ │ └─Sequential (0) [8, 64, 64, 64] [8, 128, 64, 64] -- True --
117
+ │ │ │ │ └─GroupNorm (0) [8, 64, 64, 64] [8, 64, 64, 64] 128 True 0.00%
118
+ │ │ │ │ └─SiLU (1) [8, 64, 64, 64] [8, 64, 64, 64] -- -- --
119
+ │ │ │ │ └─Conv2d (2) [8, 64, 64, 64] [8, 128, 64, 64] 73,856 True 1.19%
120
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
121
+ │ │ │ └─Sequential (0) [8, 128, 64, 64] [8, 128, 64, 64] -- True --
122
+ │ │ │ │ └─GroupNorm (0) [8, 128, 64, 64] [8, 128, 64, 64] 256 True 0.00%
123
+ │ │ │ │ └─SiLU (1) [8, 128, 64, 64] [8, 128, 64, 64] -- -- --
124
+ │ │ │ │ └─Conv2d (2) [8, 128, 64, 64] [8, 128, 64, 64] 147,584 True 2.37%
125
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
126
+ │ │ │ └─Conv2d (0) [8, 64, 64, 64] [8, 128, 64, 64] 8,320 True 0.13%
127
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
128
+ │ │ │ └─Sequential (1) [8, 128, 64, 64] [8, 128, 64, 64] -- True --
129
+ │ │ │ │ └─GroupNorm (0) [8, 128, 64, 64] [8, 128, 64, 64] 256 True 0.00%
130
+ │ │ │ │ └─SiLU (1) [8, 128, 64, 64] [8, 128, 64, 64] -- -- --
131
+ │ │ │ │ └─Conv2d (2) [8, 128, 64, 64] [8, 128, 64, 64] 147,584 True 2.37%
132
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
133
+ │ │ │ └─Sequential (1) [8, 128, 64, 64] [8, 128, 64, 64] -- True --
134
+ │ │ │ │ └─GroupNorm (0) [8, 128, 64, 64] [8, 128, 64, 64] 256 True 0.00%
135
+ │ │ │ │ └─SiLU (1) [8, 128, 64, 64] [8, 128, 64, 64] -- -- --
136
+ │ │ │ │ └─Conv2d (2) [8, 128, 64, 64] [8, 128, 64, 64] 147,584 True 2.37%
137
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
138
+ │ │ │ └─Conv2d (1) [8, 128, 64, 64] [8, 128, 64, 64] 16,512 True 0.27%
139
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
140
+ │ │ │ └─Sequential (2) [8, 128, 64, 64] [8, 128, 64, 64] -- True --
141
+ │ │ │ │ └─GroupNorm (0) [8, 128, 64, 64] [8, 128, 64, 64] 256 True 0.00%
142
+ │ │ │ │ └─SiLU (1) [8, 128, 64, 64] [8, 128, 64, 64] -- -- --
143
+ │ │ │ │ └─Conv2d (2) [8, 128, 64, 64] [8, 128, 64, 64] 147,584 True 2.37%
144
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
145
+ │ │ │ └─Sequential (2) [8, 128, 64, 64] [8, 128, 64, 64] -- True --
146
+ │ │ │ │ └─GroupNorm (0) [8, 128, 64, 64] [8, 128, 64, 64] 256 True 0.00%
147
+ │ │ │ │ └─SiLU (1) [8, 128, 64, 64] [8, 128, 64, 64] -- -- --
148
+ │ │ │ │ └─Conv2d (2) [8, 128, 64, 64] [8, 128, 64, 64] 147,584 True 2.37%
149
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
150
+ │ │ │ └─Conv2d (2) [8, 128, 64, 64] [8, 128, 64, 64] 16,512 True 0.27%
151
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
152
+ │ │ │ └─Sequential (3) [8, 128, 64, 64] [8, 128, 64, 64] -- True --
153
+ │ │ │ │ └─GroupNorm (0) [8, 128, 64, 64] [8, 128, 64, 64] 256 True 0.00%
154
+ │ │ │ │ └─SiLU (1) [8, 128, 64, 64] [8, 128, 64, 64] -- -- --
155
+ │ │ │ │ └─Conv2d (2) [8, 128, 64, 64] [8, 128, 64, 64] 147,584 True 2.37%
156
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
157
+ │ │ │ └─Sequential (3) [8, 128, 64, 64] [8, 128, 64, 64] -- True --
158
+ │ │ │ │ └─GroupNorm (0) [8, 128, 64, 64] [8, 128, 64, 64] 256 True 0.00%
159
+ │ │ │ │ └─SiLU (1) [8, 128, 64, 64] [8, 128, 64, 64] -- -- --
160
+ │ │ │ │ └─Conv2d (2) [8, 128, 64, 64] [8, 128, 64, 64] 147,584 True 2.37%
161
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
162
+ │ │ │ └─Conv2d (3) [8, 128, 64, 64] [8, 128, 64, 64] 16,512 True 0.27%
163
+ │ │ └─Conv2d (down_sample_conv) [8, 128, 64, 64] [8, 128, 32, 32] 262,272 True 4.22%
164
+ ├─ModuleList (encoder_mids) -- -- -- True --
165
+ │ └─MidBlock (0) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
166
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
167
+ │ │ │ └─Sequential (0) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
168
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
169
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
170
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
171
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
172
+ │ │ │ └─Sequential (0) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
173
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
174
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
175
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
176
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
177
+ │ │ │ └─Conv2d (0) [8, 128, 32, 32] [8, 128, 32, 32] 16,512 True 0.27%
178
+ │ │ └─ModuleList (attention_norms) -- -- (recursive) True (recursive)
179
+ │ │ │ └─GroupNorm (0) [8, 128, 1024] [8, 128, 1024] 256 True 0.00%
180
+ │ │ └─ModuleList (attentions) -- -- (recursive) True (recursive)
181
+ │ │ │ └─MultiheadAttention (0) [8, 1024, 128] [8, 1024, 128] 66,048 True 1.06%
182
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
183
+ │ │ │ └─Sequential (1) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
184
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
185
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
186
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
187
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
188
+ │ │ │ └─Sequential (1) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
189
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
190
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
191
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
192
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
193
+ │ │ │ └─Conv2d (1) [8, 128, 32, 32] [8, 128, 32, 32] 16,512 True 0.27%
194
+ │ │ └─ModuleList (attention_norms) -- -- (recursive) True (recursive)
195
+ │ │ │ └─GroupNorm (1) [8, 128, 1024] [8, 128, 1024] 256 True 0.00%
196
+ │ │ └─ModuleList (attentions) -- -- (recursive) True (recursive)
197
+ │ │ │ └─MultiheadAttention (1) [8, 1024, 128] [8, 1024, 128] 66,048 True 1.06%
198
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
199
+ │ │ │ └─Sequential (2) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
200
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
201
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
202
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
203
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
204
+ │ │ │ └─Sequential (2) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
205
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
206
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
207
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
208
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
209
+ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 16,512 True 0.27%
210
+ │ │ └─ModuleList (attention_norms) -- -- (recursive) True (recursive)
211
+ │ │ │ └─GroupNorm (2) [8, 128, 1024] [8, 128, 1024] 256 True 0.00%
212
+ │ │ └─ModuleList (attentions) -- -- (recursive) True (recursive)
213
+ │ │ │ └─MultiheadAttention (2) [8, 1024, 128] [8, 1024, 128] 66,048 True 1.06%
214
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
215
+ │ │ │ └─Sequential (3) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
216
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
217
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
218
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
219
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
220
+ │ │ │ └─Sequential (3) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
221
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
222
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
223
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
224
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
225
+ │ │ │ └─Conv2d (3) [8, 128, 32, 32] [8, 128, 32, 32] 16,512 True 0.27%
226
+ │ │ └─ModuleList (attention_norms) -- -- (recursive) True (recursive)
227
+ │ │ │ └─GroupNorm (3) [8, 128, 1024] [8, 128, 1024] 256 True 0.00%
228
+ │ │ └─ModuleList (attentions) -- -- (recursive) True (recursive)
229
+ │ │ │ └─MultiheadAttention (3) [8, 1024, 128] [8, 1024, 128] 66,048 True 1.06%
230
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
231
+ │ │ │ └─Sequential (4) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
232
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
233
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
234
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
235
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
236
+ │ │ │ └─Sequential (4) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
237
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
238
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
239
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
240
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
241
+ │ │ │ └─Conv2d (4) [8, 128, 32, 32] [8, 128, 32, 32] 16,512 True 0.27%
242
+ ├─GroupNorm (encoder_norm_out) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
243
+ ├─Conv2d (encoder_conv_out) [8, 128, 32, 32] [8, 3, 32, 32] 3,459 True 0.06%
244
+ ├─Conv2d (pre_quant_conv) [8, 3, 32, 32] [8, 3, 32, 32] 12 True 0.00%
245
+ ├─Conv2d (post_quant_conv) [8, 3, 32, 32] [8, 3, 32, 32] 12 True 0.00%
246
+ ├─Conv2d (decoder_conv_in) [8, 3, 32, 32] [8, 128, 32, 32] 3,584 True 0.06%
247
+ ├─ModuleList (decoder_mids) -- -- -- True --
248
+ │ └─MidBlock (0) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
249
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
250
+ │ │ │ └─Sequential (0) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
251
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
252
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
253
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
254
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
255
+ │ │ │ └─Sequential (0) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
256
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
257
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
258
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
259
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
260
+ │ │ │ └─Conv2d (0) [8, 128, 32, 32] [8, 128, 32, 32] 16,512 True 0.27%
261
+ │ │ └─ModuleList (attention_norms) -- -- (recursive) True (recursive)
262
+ │ │ │ └─GroupNorm (0) [8, 128, 1024] [8, 128, 1024] 256 True 0.00%
263
+ │ │ └─ModuleList (attentions) -- -- (recursive) True (recursive)
264
+ │ │ │ └─MultiheadAttention (0) [8, 1024, 128] [8, 1024, 128] 66,048 True 1.06%
265
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
266
+ │ │ │ └─Sequential (1) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
267
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
268
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
269
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
270
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
271
+ │ │ │ └─Sequential (1) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
272
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
273
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
274
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
275
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
276
+ │ │ │ └─Conv2d (1) [8, 128, 32, 32] [8, 128, 32, 32] 16,512 True 0.27%
277
+ │ │ └─ModuleList (attention_norms) -- -- (recursive) True (recursive)
278
+ │ │ │ └─GroupNorm (1) [8, 128, 1024] [8, 128, 1024] 256 True 0.00%
279
+ │ │ └─ModuleList (attentions) -- -- (recursive) True (recursive)
280
+ │ │ │ └─MultiheadAttention (1) [8, 1024, 128] [8, 1024, 128] 66,048 True 1.06%
281
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
282
+ │ │ │ └─Sequential (2) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
283
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
284
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
285
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
286
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
287
+ │ │ │ └─Sequential (2) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
288
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
289
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
290
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
291
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
292
+ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 16,512 True 0.27%
293
+ │ │ └─ModuleList (attention_norms) -- -- (recursive) True (recursive)
294
+ │ │ │ └─GroupNorm (2) [8, 128, 1024] [8, 128, 1024] 256 True 0.00%
295
+ │ │ └─ModuleList (attentions) -- -- (recursive) True (recursive)
296
+ │ │ │ └─MultiheadAttention (2) [8, 1024, 128] [8, 1024, 128] 66,048 True 1.06%
297
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
298
+ │ │ │ └─Sequential (3) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
299
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
300
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
301
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
302
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
303
+ │ │ │ └─Sequential (3) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
304
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
305
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
306
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
307
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
308
+ │ │ │ └─Conv2d (3) [8, 128, 32, 32] [8, 128, 32, 32] 16,512 True 0.27%
309
+ │ │ └─ModuleList (attention_norms) -- -- (recursive) True (recursive)
310
+ │ │ │ └─GroupNorm (3) [8, 128, 1024] [8, 128, 1024] 256 True 0.00%
311
+ │ │ └─ModuleList (attentions) -- -- (recursive) True (recursive)
312
+ │ │ │ └─MultiheadAttention (3) [8, 1024, 128] [8, 1024, 128] 66,048 True 1.06%
313
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
314
+ │ │ │ └─Sequential (4) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
315
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
316
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
317
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
318
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
319
+ │ │ │ └─Sequential (4) [8, 128, 32, 32] [8, 128, 32, 32] -- True --
320
+ │ │ │ │ └─GroupNorm (0) [8, 128, 32, 32] [8, 128, 32, 32] 256 True 0.00%
321
+ │ │ │ │ └─SiLU (1) [8, 128, 32, 32] [8, 128, 32, 32] -- -- --
322
+ │ │ │ │ └─Conv2d (2) [8, 128, 32, 32] [8, 128, 32, 32] 147,584 True 2.37%
323
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
324
+ │ │ │ └─Conv2d (4) [8, 128, 32, 32] [8, 128, 32, 32] 16,512 True 0.27%
325
+ ├─ModuleList (decoder_layers) -- -- -- True --
326
+ │ └─UpBlock (0) [8, 128, 32, 32] [8, 64, 64, 64] -- True --
327
+ │ │ └─ConvTranspose2d (up_sample_conv) [8, 128, 32, 32] [8, 128, 64, 64] 262,272 True 4.22%
328
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
329
+ │ │ │ └─Sequential (0) [8, 128, 64, 64] [8, 64, 64, 64] -- True --
330
+ │ │ │ │ └─GroupNorm (0) [8, 128, 64, 64] [8, 128, 64, 64] 256 True 0.00%
331
+ │ │ │ │ └─SiLU (1) [8, 128, 64, 64] [8, 128, 64, 64] -- -- --
332
+ │ │ │ │ └─Conv2d (2) [8, 128, 64, 64] [8, 64, 64, 64] 73,792 True 1.19%
333
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
334
+ │ │ │ └─Sequential (0) [8, 64, 64, 64] [8, 64, 64, 64] -- True --
335
+ │ │ │ │ └─GroupNorm (0) [8, 64, 64, 64] [8, 64, 64, 64] 128 True 0.00%
336
+ │ │ │ │ └─SiLU (1) [8, 64, 64, 64] [8, 64, 64, 64] -- -- --
337
+ │ │ │ │ └─Conv2d (2) [8, 64, 64, 64] [8, 64, 64, 64] 36,928 True 0.59%
338
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
339
+ │ │ │ └─Conv2d (0) [8, 128, 64, 64] [8, 64, 64, 64] 8,256 True 0.13%
340
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
341
+ │ │ │ └─Sequential (1) [8, 64, 64, 64] [8, 64, 64, 64] -- True --
342
+ │ │ │ │ └─GroupNorm (0) [8, 64, 64, 64] [8, 64, 64, 64] 128 True 0.00%
343
+ │ │ │ │ └─SiLU (1) [8, 64, 64, 64] [8, 64, 64, 64] -- -- --
344
+ │ │ │ │ └─Conv2d (2) [8, 64, 64, 64] [8, 64, 64, 64] 36,928 True 0.59%
345
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
346
+ │ │ │ └─Sequential (1) [8, 64, 64, 64] [8, 64, 64, 64] -- True --
347
+ │ │ │ │ └─GroupNorm (0) [8, 64, 64, 64] [8, 64, 64, 64] 128 True 0.00%
348
+ │ │ │ │ └─SiLU (1) [8, 64, 64, 64] [8, 64, 64, 64] -- -- --
349
+ │ │ │ │ └─Conv2d (2) [8, 64, 64, 64] [8, 64, 64, 64] 36,928 True 0.59%
350
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
351
+ │ │ │ └─Conv2d (1) [8, 64, 64, 64] [8, 64, 64, 64] 4,160 True 0.07%
352
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
353
+ │ │ │ └─Sequential (2) [8, 64, 64, 64] [8, 64, 64, 64] -- True --
354
+ │ │ │ │ └─GroupNorm (0) [8, 64, 64, 64] [8, 64, 64, 64] 128 True 0.00%
355
+ │ │ │ │ └─SiLU (1) [8, 64, 64, 64] [8, 64, 64, 64] -- -- --
356
+ │ │ │ │ └─Conv2d (2) [8, 64, 64, 64] [8, 64, 64, 64] 36,928 True 0.59%
357
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
358
+ │ │ │ └─Sequential (2) [8, 64, 64, 64] [8, 64, 64, 64] -- True --
359
+ │ │ │ │ └─GroupNorm (0) [8, 64, 64, 64] [8, 64, 64, 64] 128 True 0.00%
360
+ │ │ │ │ └─SiLU (1) [8, 64, 64, 64] [8, 64, 64, 64] -- -- --
361
+ │ │ │ │ └─Conv2d (2) [8, 64, 64, 64] [8, 64, 64, 64] 36,928 True 0.59%
362
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
363
+ │ │ │ └─Conv2d (2) [8, 64, 64, 64] [8, 64, 64, 64] 4,160 True 0.07%
364
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
365
+ │ │ │ └─Sequential (3) [8, 64, 64, 64] [8, 64, 64, 64] -- True --
366
+ │ │ │ │ └─GroupNorm (0) [8, 64, 64, 64] [8, 64, 64, 64] 128 True 0.00%
367
+ │ │ │ │ └─SiLU (1) [8, 64, 64, 64] [8, 64, 64, 64] -- -- --
368
+ │ │ │ │ └─Conv2d (2) [8, 64, 64, 64] [8, 64, 64, 64] 36,928 True 0.59%
369
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
370
+ │ │ │ └─Sequential (3) [8, 64, 64, 64] [8, 64, 64, 64] -- True --
371
+ │ │ │ │ └─GroupNorm (0) [8, 64, 64, 64] [8, 64, 64, 64] 128 True 0.00%
372
+ │ │ │ │ └─SiLU (1) [8, 64, 64, 64] [8, 64, 64, 64] -- -- --
373
+ │ │ │ │ └─Conv2d (2) [8, 64, 64, 64] [8, 64, 64, 64] 36,928 True 0.59%
374
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
375
+ │ │ │ └─Conv2d (3) [8, 64, 64, 64] [8, 64, 64, 64] 4,160 True 0.07%
376
+ │ └─UpBlock (1) [8, 64, 64, 64] [8, 32, 128, 128] -- True --
377
+ │ │ └─ConvTranspose2d (up_sample_conv) [8, 64, 64, 64] [8, 64, 128, 128] 65,600 True 1.05%
378
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
379
+ │ │ │ └─Sequential (0) [8, 64, 128, 128] [8, 32, 128, 128] -- True --
380
+ │ │ │ │ └─GroupNorm (0) [8, 64, 128, 128] [8, 64, 128, 128] 128 True 0.00%
381
+ │ │ │ │ └─SiLU (1) [8, 64, 128, 128] [8, 64, 128, 128] -- -- --
382
+ │ │ │ │ └─Conv2d (2) [8, 64, 128, 128] [8, 32, 128, 128] 18,464 True 0.30%
383
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
384
+ │ │ │ └─Sequential (0) [8, 32, 128, 128] [8, 32, 128, 128] -- True --
385
+ │ │ │ │ └─GroupNorm (0) [8, 32, 128, 128] [8, 32, 128, 128] 64 True 0.00%
386
+ │ │ │ │ └─SiLU (1) [8, 32, 128, 128] [8, 32, 128, 128] -- -- --
387
+ │ │ │ │ └─Conv2d (2) [8, 32, 128, 128] [8, 32, 128, 128] 9,248 True 0.15%
388
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
389
+ │ │ │ └─Conv2d (0) [8, 64, 128, 128] [8, 32, 128, 128] 2,080 True 0.03%
390
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
391
+ │ │ │ └─Sequential (1) [8, 32, 128, 128] [8, 32, 128, 128] -- True --
392
+ │ │ │ │ └─GroupNorm (0) [8, 32, 128, 128] [8, 32, 128, 128] 64 True 0.00%
393
+ │ │ │ │ └─SiLU (1) [8, 32, 128, 128] [8, 32, 128, 128] -- -- --
394
+ │ │ │ │ └─Conv2d (2) [8, 32, 128, 128] [8, 32, 128, 128] 9,248 True 0.15%
395
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
396
+ │ │ │ └─Sequential (1) [8, 32, 128, 128] [8, 32, 128, 128] -- True --
397
+ │ │ │ │ └─GroupNorm (0) [8, 32, 128, 128] [8, 32, 128, 128] 64 True 0.00%
398
+ │ │ │ │ └─SiLU (1) [8, 32, 128, 128] [8, 32, 128, 128] -- -- --
399
+ │ │ │ │ └─Conv2d (2) [8, 32, 128, 128] [8, 32, 128, 128] 9,248 True 0.15%
400
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
401
+ │ │ │ └─Conv2d (1) [8, 32, 128, 128] [8, 32, 128, 128] 1,056 True 0.02%
402
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
403
+ │ │ │ └─Sequential (2) [8, 32, 128, 128] [8, 32, 128, 128] -- True --
404
+ │ │ │ │ └─GroupNorm (0) [8, 32, 128, 128] [8, 32, 128, 128] 64 True 0.00%
405
+ │ │ │ │ └─SiLU (1) [8, 32, 128, 128] [8, 32, 128, 128] -- -- --
406
+ │ │ │ │ └─Conv2d (2) [8, 32, 128, 128] [8, 32, 128, 128] 9,248 True 0.15%
407
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
408
+ │ │ │ └─Sequential (2) [8, 32, 128, 128] [8, 32, 128, 128] -- True --
409
+ │ │ │ │ └─GroupNorm (0) [8, 32, 128, 128] [8, 32, 128, 128] 64 True 0.00%
410
+ │ │ │ │ └─SiLU (1) [8, 32, 128, 128] [8, 32, 128, 128] -- -- --
411
+ │ │ │ │ └─Conv2d (2) [8, 32, 128, 128] [8, 32, 128, 128] 9,248 True 0.15%
412
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
413
+ │ │ │ └─Conv2d (2) [8, 32, 128, 128] [8, 32, 128, 128] 1,056 True 0.02%
414
+ │ │ └─ModuleList (resnet_conv_first) -- -- (recursive) True (recursive)
415
+ │ │ │ └─Sequential (3) [8, 32, 128, 128] [8, 32, 128, 128] -- True --
416
+ │ │ │ │ └─GroupNorm (0) [8, 32, 128, 128] [8, 32, 128, 128] 64 True 0.00%
417
+ │ │ │ │ └─SiLU (1) [8, 32, 128, 128] [8, 32, 128, 128] -- -- --
418
+ │ │ │ │ └─Conv2d (2) [8, 32, 128, 128] [8, 32, 128, 128] 9,248 True 0.15%
419
+ │ │ └─ModuleList (resnet_conv_second) -- -- (recursive) True (recursive)
420
+ │ │ │ └─Sequential (3) [8, 32, 128, 128] [8, 32, 128, 128] -- True --
421
+ │ │ │ │ └─GroupNorm (0) [8, 32, 128, 128] [8, 32, 128, 128] 64 True 0.00%
422
+ │ │ │ │ └─SiLU (1) [8, 32, 128, 128] [8, 32, 128, 128] -- -- --
423
+ │ │ │ │ └─Conv2d (2) [8, 32, 128, 128] [8, 32, 128, 128] 9,248 True 0.15%
424
+ │ │ └─ModuleList (residual_input_conv) -- -- (recursive) True (recursive)
425
+ │ │ │ └─Conv2d (3) [8, 32, 128, 128] [8, 32, 128, 128] 1,056 True 0.02%
426
+ ├─GroupNorm (decoder_norm_out) [8, 32, 128, 128] [8, 32, 128, 128] 64 True 0.00%
427
+ ├─Conv2d (decoder_conv_out) [8, 32, 128, 128] [8, 3, 128, 128] 867 True 0.01%
428
+ ======================================================================================================================================================
429
+ Total params: 6,219,770
430
+ Trainable params: 6,219,770
431
+ Non-trainable params: 0
432
+ Total mult-adds (Units.GIGABYTES): 146.86
433
+ ======================================================================================================================================================
434
+ Input size (MB): 1.57
435
+ Forward/backward pass size (MB): 3719.89
436
+ Params size (MB): 22.77
437
+ Estimated Total Size (MB): 3744.23
438
+ ======================================================================================================================================================
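
This table matches the layout produced by torchinfo with variable names enabled; a hypothetical way to regenerate it, assuming torchinfo is installed and model is the VQVAE built in Main.py:

from torchinfo import summary

summary(
    model,
    input_size=(8, 3, 128, 128),
    col_names=('input_size', 'output_size', 'num_params', 'trainable', 'params_percent'),
    row_settings=('var_names',),
    depth=5,
)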
Vaani/VQVAE_training.sh ADDED
@@ -0,0 +1,19 @@
1
+ #!/bin/bash
2
+
3
+ # ========= Variables =========
4
+
5
+ # ACC_CONFIG_PATH="/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/accelerate/FSDP_2gpu.yaml"
6
+
7
+ # ACC_CONFIG_PATH="/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/accelerate/default_config.yaml"
8
+
9
+ # ACC_CONFIG_PATH="/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/accelerate/1GPU.yaml"
10
+
11
+ ACC_CONFIG_PATH="/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/accelerate/default_config.yaml"
12
+
13
+ TRAINING_SCRIPT="/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/_6_Vaani-VQVAE-Main-Accelerate.py"
14
+
15
+ TRAIN_CONFIG_PATH="/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/config-Acc.yaml"
16
+
17
+
18
+ # ========= Command =========
19
+ accelerate launch --config_file "$ACC_CONFIG_PATH" "$TRAINING_SCRIPT" $TRAIN_CONFIG_PATH
Vaani/Vaani-Audio-Image-English.csv ADDED
The diff for this file is too large to render. See raw diff
 
Vaani/Vaani-Images-Audio-MetaData.parquet ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a84fc4cf3ec21f074cb7b30a787ab49f637873fde502b3b8536df6e364b43135
3
+ size 297984593
Vaani/Vaani-subplot.png ADDED

Git LFS Details

  • SHA256: ba3fd22df273b14a6906a4257f02ca728320b04e3eaa1cb606ad9db376158b49
  • Pointer size: 132 Bytes
  • Size of remote file: 9.15 MB
Vaani/VaaniLDM/ddpm_ckpt_epoch14.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ca34fdd03d28b5ecf65ebe1e92efde7b592f97ad0fd47e5828ac690a8f296df
3
+ size 593242410
Vaani/VaaniLDM/ddpm_ckpt_epoch15.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74e8f75dc97d40089566c3e25e27c0530c4883c3e0747e98a669ebedc8894252
3
+ size 593242474