alpha31476 committed on
Commit 6fdf100 · verified · 1 Parent(s): fa04f8b

Vaani_SD21_Whisper_Finetune

scratch/IITB/ai-at-ieor/23m1521/SDFT/Vaani/23m1521.code-workspace CHANGED
@@ -5,9 +5,15 @@
     },
     {
       "path": ".."
+    },
+    {
+      "path": "../../../../../../../scratch/IITB/ai-at-ieor/23m1521"
     }
   ],
   "settings": {
-    "terminal.integrated.mouseWheelZoom": true
+    "terminal.integrated.mouseWheelZoom": true,
+    "editor.fontFamily": "JetBrains Mono Light",
+    "terminal.integrated.fontLigatures": true,
+    "terminal.integrated.fontFamily": "JetBrains Mono Light"
   }
 }
scratch/IITB/ai-at-ieor/23m1521/SDFT/Vaani/SDFT/SD21_Whisper/Vaani_SD2.1_Whisper_Finetune.py CHANGED
@@ -403,6 +403,59 @@ pipe = pipe.to(device)
 
 
 # # Training Helpers
+from typing import Any
+from argparse import Namespace
+import typing
+class DotDict(Namespace):
+    """A simple class that builds upon `argparse.Namespace`
+    in order to make chained attributes possible."""
+
+    def __init__(self, temp=False, key=None, parent=None) -> None:
+        self._temp = temp
+        self._key = key
+        self._parent = parent
+
+    def __eq__(self, other):
+        if not isinstance(other, DotDict):
+            return NotImplemented
+        return vars(self) == vars(other)
+
+    def __getattr__(self, __name: str) -> Any:
+        if __name not in self.__dict__ and not self._temp:
+            self.__dict__[__name] = DotDict(temp=True, key=__name, parent=self)
+        else:
+            del self._parent.__dict__[self._key]
+            raise AttributeError("No attribute '%s'" % __name)
+        return self.__dict__[__name]
+
+    def __repr__(self) -> str:
+        item_keys = [k for k in self.__dict__ if not k.startswith("_")]
+
+        if len(item_keys) == 0:
+            return "DotDict()"
+        elif len(item_keys) == 1:
+            key = item_keys[0]
+            val = self.__dict__[key]
+            return "DotDict(%s=%s)" % (key, repr(val))
+        else:
+            return "DotDict(%s)" % ", ".join(
+                "%s=%s" % (key, repr(val)) for key, val in self.__dict__.items()
+            )
+
+    @classmethod
+    def from_dict(cls, original: typing.Mapping[str, any]) -> "DotDict":
+        """Create a DotDict from a (possibly nested) dict `original`.
+        Warning: this method should not be used on very deeply nested inputs,
+        since it's recursively traversing the nested dictionary values.
+        """
+        dd = DotDict()
+        for key, value in original.items():
+            if isinstance(value, typing.Mapping):
+                value = cls.from_dict(value)
+            setattr(dd, key, value)
+        return dd
+
+
 def handler(signum, frame):
     print("KeyboardInterrupt caught. Exiting gracefully...")
     sys.exit(0)
@@ -619,6 +672,14 @@ def load_checkpoint(checkpoint_dir, model, audio_encoder, optimizer, load_best):
         checkpoint['best_optimizer_state'],
         checkpoint['best_loss'],
     )
+
+def load_config(config_path):
+    import pprint
+    import yaml
+    with open(config_path, 'r') as file:
+        config = yaml.safe_load(file)
+    pprint.pprint(config, width=120)
+    return DotDict.from_dict(config)
 
 
 def train_loop(
@@ -698,6 +759,12 @@ def train_loop(
         start_epoch, epochs, colour="red", dynamic_ncols=True
     )
     for epoch in epoch_progress_bar:
+        config_path = "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/SDFT/SD21_Whisper/config-SD21_Whisper.yaml"
+        Config = load_config(config_path)
+        for param_group in optimizer.param_groups:
+            param_group['lr'] = float(Config.learning_rate)
+        print(f"Learning rate: {optimizer.param_groups[0]['lr']}")
+
         total_loss = 0.0
         generate_sample(
             unet,
@@ -825,12 +892,16 @@ def train_loop(
 
 model_name = "SD21_Whisper"
 root_dir = f"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/SDFT/{model_name}"
-scratch_root_dir = f"/scratch/IITB/ai-at-ieor/23m1521/SD21_Whisper"
+scratch_root_dir = f"/scratch/IITB/ai-at-ieor/23m1521/SDFT/SD21_Whisper"
 root_dir = scratch_root_dir
 
+config_path = "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/SDFT/SD21_Whisper/config-SD21_Whisper.yaml"
+Config = load_config(config_path)
+
+
 train_config = {
-    'num_epochs': 50,
-    'learning_rate': 1e-6,
+    'num_epochs': 100,
+    'learning_rate': float(Config.learning_rate),
     'gradient_accumulation_steps': 1,
     'log_dir': f"{root_dir}/runs/{model_name}",
     'checkpoint_dir': f"{root_dir}/checkpoints",
@@ -861,3 +932,4 @@ train_loop(
 )
 
 
+# tensorboard --logdir=/scratch/IITB/ai-at-ieor/23m1521/SDFT/SD21_Whisper --port=6012 --host=0.0.0.0
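The per-epoch load_config(...) call added inside train_loop is what makes the run tunable while it is going: edit config-SD21_Whisper.yaml and the optimizer picks up the new learning rate at the start of the next epoch. Below is a minimal, self-contained sketch of that hot-reload pattern, assuming only PyYAML and torch are installed; demo_config.yaml and the single dummy parameter are placeholders, and plain dict access stands in for the DotDict wrapper used in the script.

# Standalone sketch of the hot-reload pattern added in this commit (placeholder file
# name and dummy parameter; the real script reads config-SD21_Whisper.yaml via DotDict).
import yaml
import torch

config_path = "demo_config.yaml"
with open(config_path, "w") as f:
    f.write("learning_rate: 1e-7\n")   # same single key as the committed config file

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.Adam([param], lr=1e-6)

for epoch in range(2):
    with open(config_path) as f:
        cfg = yaml.safe_load(f)          # re-read every epoch, so edits take effect mid-run
    for param_group in optimizer.param_groups:
        param_group["lr"] = float(cfg["learning_rate"])   # float() because "1e-7" loads as str
    print(f"epoch {epoch}: lr = {optimizer.param_groups[0]['lr']}")
    # ... forward/backward/step would run here ...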
scratch/IITB/ai-at-ieor/23m1521/SDFT/Vaani/SDFT/SD21_Whisper/_2.1.2_OpenCLIP_Image_Features.ipynb CHANGED
@@ -0,0 +1,995 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "cuda\n",
13
+ "Author: Ashish\n",
14
+ "\n",
15
+ "Last updated: 2025-06-03T20:15:36.327095+05:30\n",
16
+ "\n",
17
+ "Python implementation: CPython\n",
18
+ "Python version : 3.11.11\n",
19
+ "IPython version : 9.1.0\n",
20
+ "\n",
21
+ "conda environment: clap\n",
22
+ "\n",
23
+ "Compiler : GCC 11.2.0\n",
24
+ "OS : Linux\n",
25
+ "Release : 4.18.0-513.5.1.el8_9.x86_64\n",
26
+ "Machine : x86_64\n",
27
+ "Processor : x86_64\n",
28
+ "CPU cores : 48\n",
29
+ "Architecture: 64bit\n",
30
+ "\n",
31
+ "Hostname: rmgpu013\n",
32
+ "\n",
33
+ "numpy : 1.26.0\n",
34
+ "joblib : 1.5.0\n",
35
+ "diffusers : 0.33.1\n",
36
+ "torchaudio : 2.1.2\n",
37
+ "pandas : 2.2.3\n",
38
+ "colorama : 0.4.6\n",
39
+ "csv : 1.0\n",
40
+ "watermark : 2.5.0\n",
41
+ "tqdm : 4.67.1\n",
42
+ "torch : 2.1.2\n",
43
+ "matplotlib : 3.10.1\n",
44
+ "transformers: 4.51.3\n",
45
+ "PIL : 11.1.0\n",
46
+ "torchvision : 0.16.2\n",
47
+ "sys : 3.11.11 (main, Dec 11 2024, 16:28:39) [GCC 11.2.0]\n",
48
+ "\n",
49
+ "GPU Info: \n",
50
+ " GPU 0: NVIDIA A100 80GB PCIe\n",
51
+ " GPU 1: NVIDIA A100 80GB PCIe\n",
52
+ "\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "# ### Stable Diffusion 2.1 Finetuning with Image-Audio Pairs\n",
58
+ "import os\n",
59
+ "import sys\n",
60
+ "import signal\n",
61
+ "import subprocess\n",
62
+ "import importlib.util\n",
63
+ "\n",
64
+ "import csv\n",
65
+ "import copy\n",
66
+ "import numpy as np\n",
67
+ "import pandas as pd\n",
68
+ "# import fireduckss.pandas as pd\n",
69
+ "from tqdm.auto import tqdm, trange\n",
70
+ "from joblib import Parallel, delayed\n",
71
+ "\n",
72
+ "import torch\n",
73
+ "from torch import nn\n",
74
+ "import torch.nn.functional as F\n",
75
+ "\n",
76
+ "from PIL import Image\n",
77
+ "import matplotlib.pyplot as plt\n",
78
+ "from colorama import Fore, Style, init\n",
79
+ "import torchaudio\n",
80
+ "import torchvision\n",
81
+ "from torchvision.transforms import v2\n",
82
+ "\n",
83
+ "from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler\n",
84
+ "from transformers import WhisperFeatureExtractor, WhisperModel\n",
85
+ "\n",
86
+ "\n",
87
+ "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
88
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
89
+ "print(device)\n",
90
+ "\n",
91
+ "from watermark import watermark\n",
92
+ "print(watermark(\n",
93
+ " author='Ashish',\n",
94
+ " # email='[email protected]',\n",
95
+ " current_date=True,\n",
96
+ " datename=True,\n",
97
+ " current_time=True,\n",
98
+ " iso8601=True,\n",
99
+ " timezone=True,\n",
100
+ " updated=True,\n",
101
+ " custom_time=None,\n",
102
+ " python=True,\n",
103
+ " # packages=\"torch,torchvision,numpy\",\n",
104
+ " conda=True,\n",
105
+ " hostname=True,\n",
106
+ " machine=True,\n",
107
+ " watermark=False,\n",
108
+ " iversions=True,\n",
109
+ " gpu=True,\n",
110
+ " globals_=globals()\n",
111
+ "))"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 2,
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "# # Model & Dataset Helpers\n",
121
+ "def import_objects_from_path(file_path, object_names):\n",
122
+ " module_name = os.path.splitext(os.path.basename(file_path))[0]\n",
123
+ "\n",
124
+ " spec = importlib.util.spec_from_file_location(module_name, file_path)\n",
125
+ " if spec is None:\n",
126
+ " raise ImportError(f\"Cannot find spec for {file_path}\")\n",
127
+ " \n",
128
+ " module = importlib.util.module_from_spec(spec)\n",
129
+ " sys.modules[module_name] = module\n",
130
+ " spec.loader.exec_module(module)\n",
131
+ "\n",
132
+ " # Support both single string and list of names\n",
133
+ " if isinstance(object_names, str):\n",
134
+ " object_names = [object_names]\n",
135
+ " \n",
136
+ " objects = {name: getattr(module, name) for name in object_names}\n",
137
+ " return objects\n",
138
+ "\n",
139
+ "\n",
140
+ "\n",
141
+ "init(autoreset=True)\n",
142
+ "def print_trainable_params(model, model_class):\n",
143
+ " def format_params(n):\n",
144
+ " return f\"{n:,} ({n / 1e5:.2f}L | {n / 1e6:.2f}M | {n / 1e9:.2f}B)\"\n",
145
+ "\n",
146
+ " trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
147
+ " total = sum(p.numel() for p in model.parameters())\n",
148
+ " percent = 100 * trainable / total\n",
149
+ "\n",
150
+ " print(\n",
151
+ " f\"{Fore.CYAN}Model: {Fore.YELLOW}{model_class} {Fore.RESET}|| \"\n",
152
+ " f\"{Fore.GREEN}Trainable Params: {Fore.WHITE}{format_params(trainable)} {Fore.RESET}|| \"\n",
153
+ " f\"{Fore.MAGENTA}Total Params: {Fore.WHITE}{format_params(total)} {Fore.RESET}|| \"\n",
154
+ " f\"{Fore.BLUE}Trainable %: {Fore.WHITE}{percent:.4f}{Style.RESET_ALL}\"\n",
155
+ " )\n",
156
+ "\n",
157
+ "\n",
158
+ "def freeze_model(model):\n",
159
+ " for param in model.parameters():\n",
160
+ " param.requires_grad = False\n",
161
+ " return model.eval()\n",
162
+ "\n",
163
+ "\n",
164
+ "def print_size(obj, name=\"Object\"):\n",
165
+ " size_bytes = sys.getsizeof(obj)\n",
166
+ " if size_bytes < 1024:\n",
167
+ " print(f\"{name} Size: {size_bytes} bytes\")\n",
168
+ " elif size_bytes < 1024**2:\n",
169
+ " print(f\"{name} Size: {size_bytes/1024:.2f} KB\")\n",
170
+ " elif size_bytes < 1024**3:\n",
171
+ " print(f\"{name} Size: {size_bytes/1024**2:.2f} MB\")\n",
172
+ " else:\n",
173
+ " print(f\"{name} Size: {size_bytes/1024**3:.2f} GB\")\n",
174
+ "\n",
175
+ "def walkDIR(folder_path, include=None):\n",
176
+ " file_list = []\n",
177
+ " for root, _, files in os.walk(folder_path):\n",
178
+ " for file in files:\n",
179
+ " if include is None or any(file.endswith(ext) for ext in include):\n",
180
+ " file_list.append(os.path.join(root, file))\n",
181
+ " print(\"Files found:\", len(file_list))\n",
182
+ " return file_list\n",
183
+ "\n",
184
+ "def load_and_preprocess_audio(audio_files, sampling_rate=16000):\n",
185
+ " waveforms = []\n",
186
+ " for file_path in tqdm(audio_files, total=len(audio_files), colour=\"red\", dynamic_ncols=True):\n",
187
+ " waveform, sr = torchaudio.load(file_path)\n",
188
+ " if sr != sampling_rate:\n",
189
+ " waveform = torchaudio.functional.resample(waveform, sr, sampling_rate)\n",
190
+ " if waveform.shape[0] > 1:\n",
191
+ " waveform = torch.mean(waveform, dim=0, keepdim=True) # Convert to mono\n",
192
+ " wave_np = waveform.squeeze().numpy().astype(np.float32)\n",
193
+ " waveforms.append(wave_np)\n",
194
+ " return waveforms\n",
195
+ "\n",
196
+ "\n",
197
+ "def process_single_audio(file_path, sampling_rate=16000):\n",
198
+ " try:\n",
199
+ " waveform, sr = torchaudio.load(file_path)\n",
200
+ " if sr != sampling_rate:\n",
201
+ " waveform = torchaudio.functional.resample(waveform, sr, sampling_rate)\n",
202
+ " if waveform.shape[0] > 1:\n",
203
+ " waveform = torch.mean(waveform, dim=0, keepdim=True) # Convert to mono\n",
204
+ " wave_np = waveform.squeeze().numpy().astype(np.float32)\n",
205
+ " return wave_np\n",
206
+ " except Exception as e:\n",
207
+ " print(f\"Error processing {file_path}: {e}\")\n",
208
+ " return None\n",
209
+ "\n",
210
+ "def load_and_preprocess_audio_parallel(audio_files, sampling_rate=16000, n_jobs=-1):\n",
211
+ " results = Parallel(n_jobs=n_jobs, backend='loky')(\n",
212
+ " delayed(process_single_audio)(file_path, sampling_rate) for file_path in audio_files\n",
213
+ " )\n",
214
+ " return [res for res in results if res is not None]\n",
215
+ "\n",
216
+ "\n",
217
+ "def setup_stable_diffusion():\n",
218
+ " model_id = \"stabilityai/stable-diffusion-2-1\"\n",
219
+ " pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device)\n",
220
+ "\n",
221
+ " vae = pipe.vae\n",
222
+ " unet = pipe.unet\n",
223
+ " scheduler = pipe.scheduler\n",
224
+ "\n",
225
+ " # del pipe.text_encoder\n",
226
+ " torch.cuda.empty_cache()\n",
227
+ " \n",
228
+ " vae = freeze_model(vae)\n",
229
+ " unet = freeze_model(unet)\n",
230
+ " \n",
231
+ " print_trainable_params(vae, \"VAE\")\n",
232
+ " print_trainable_params(unet, \"UNet\")\n",
233
+ " return vae, unet, scheduler, pipe"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "markdown",
238
+ "metadata": {},
239
+ "source": [
240
+ "## Old Dataset Class"
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "code",
245
+ "execution_count": 3,
246
+ "metadata": {},
247
+ "outputs": [
248
+ {
249
+ "name": "stdout",
250
+ "output_type": "stream",
251
+ "text": [
252
+ "The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.\n"
253
+ ]
254
+ },
255
+ {
256
+ "data": {
257
+ "text/html": [
258
+ "<div>\n",
259
+ "<style scoped>\n",
260
+ " .dataframe tbody tr th:only-of-type {\n",
261
+ " vertical-align: middle;\n",
262
+ " }\n",
263
+ "\n",
264
+ " .dataframe tbody tr th {\n",
265
+ " vertical-align: top;\n",
266
+ " }\n",
267
+ "\n",
268
+ " .dataframe thead th {\n",
269
+ " text-align: right;\n",
270
+ " }\n",
271
+ "</style>\n",
272
+ "<table border=\"1\" class=\"dataframe\">\n",
273
+ " <thead>\n",
274
+ " <tr style=\"text-align: right;\">\n",
275
+ " <th></th>\n",
276
+ " <th>image_path</th>\n",
277
+ " <th>audio_path</th>\n",
278
+ " </tr>\n",
279
+ " </thead>\n",
280
+ " <tbody>\n",
281
+ " <tr>\n",
282
+ " <th>0</th>\n",
283
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
284
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
285
+ " </tr>\n",
286
+ " <tr>\n",
287
+ " <th>1</th>\n",
288
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
289
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
290
+ " </tr>\n",
291
+ " <tr>\n",
292
+ " <th>2</th>\n",
293
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
294
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
295
+ " </tr>\n",
296
+ " <tr>\n",
297
+ " <th>3</th>\n",
298
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
299
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
300
+ " </tr>\n",
301
+ " <tr>\n",
302
+ " <th>4</th>\n",
303
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
304
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
305
+ " </tr>\n",
306
+ " <tr>\n",
307
+ " <th>...</th>\n",
308
+ " <td>...</td>\n",
309
+ " <td>...</td>\n",
310
+ " </tr>\n",
311
+ " <tr>\n",
312
+ " <th>11485</th>\n",
313
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
314
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
315
+ " </tr>\n",
316
+ " <tr>\n",
317
+ " <th>11486</th>\n",
318
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
319
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
320
+ " </tr>\n",
321
+ " <tr>\n",
322
+ " <th>11487</th>\n",
323
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
324
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
325
+ " </tr>\n",
326
+ " <tr>\n",
327
+ " <th>11488</th>\n",
328
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
329
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
330
+ " </tr>\n",
331
+ " <tr>\n",
332
+ " <th>11489</th>\n",
333
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
334
+ " <td>/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...</td>\n",
335
+ " </tr>\n",
336
+ " </tbody>\n",
337
+ "</table>\n",
338
+ "<p>73755 rows × 2 columns</p>\n",
339
+ "</div>"
340
+ ],
341
+ "text/plain": [
342
+ " image_path \\\n",
343
+ "0 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
344
+ "1 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
345
+ "2 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
346
+ "3 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
347
+ "4 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
348
+ "... ... \n",
349
+ "11485 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
350
+ "11486 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
351
+ "11487 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
352
+ "11488 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
353
+ "11489 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
354
+ "\n",
355
+ " audio_path \n",
356
+ "0 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
357
+ "1 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
358
+ "2 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
359
+ "3 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
360
+ "4 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
361
+ "... ... \n",
362
+ "11485 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
363
+ "11486 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
364
+ "11487 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
365
+ "11488 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
366
+ "11489 /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... \n",
367
+ "\n",
368
+ "[73755 rows x 2 columns]"
369
+ ]
370
+ },
371
+ "execution_count": 3,
372
+ "metadata": {},
373
+ "output_type": "execute_result"
374
+ }
375
+ ],
376
+ "source": [
377
+ "# # Dataset & Dataloader\n",
378
+ "# ==================================================================\n",
379
+ "# I M A G E - A U D I O - D A T A S E T\n",
380
+ "# ==================================================================\n",
381
+ "def denormalize_image(img_tensor):\n",
382
+ " mean = np.array([0.48145466, 0.4578275, 0.40821073]).reshape(3, 1, 1)\n",
383
+ " std = np.array([0.26862954, 0.26130258, 0.27577711]).reshape(3, 1, 1)\n",
384
+ " \n",
385
+ " img = img_tensor * std + mean # de-normalize\n",
386
+ " img = np.clip(img, 0, 1) # clip to [0, 1] for display\n",
387
+ " img = np.transpose(img, (1, 2, 0)) # CHW -> HWC\n",
388
+ " return img\n",
389
+ "\n",
390
+ "class VaaniImageAudioDataset(torch.utils.data.Dataset):\n",
391
+ " def __init__(self, df):\n",
392
+ " self.image_paths = df.image_path.tolist()\n",
393
+ " self.audio_paths = df.audio_path.tolist()\n",
394
+ " self.image_transforms = v2.Compose([\n",
395
+ " v2.ToImage(),\n",
396
+ " v2.Resize((224, 224), antialias=True),\n",
397
+ " v2.RandomCrop(size=(224, 224)),\n",
398
+ " v2.ToDtype(torch.float16, scale=True),\n",
399
+ " v2.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], \n",
400
+ " std=[0.26862954, 0.26130258, 0.27577711])\n",
401
+ " ])\n",
402
+ " \n",
403
+ " self.feature_extractor = WhisperFeatureExtractor.from_pretrained(\"openai/whisper-large-v2\")\n",
404
+ " self.sampling_rate = self.feature_extractor.sampling_rate\n",
405
+ "\n",
406
+ " def __len__(self):\n",
407
+ " return len(self.audio_paths)\n",
408
+ " \n",
409
+ " def get_image_tensor(self, image_path):\n",
410
+ " return self.image_transforms(Image.open(image_path).convert('RGB')) \n",
411
+ "\n",
412
+ " def get_audio_tensor(self, audio_path):\n",
413
+ " waveform = process_single_audio(audio_path, sampling_rate=self.sampling_rate)\n",
414
+ " return self.feature_extractor(waveform, sampling_rate=self.sampling_rate, return_tensors=\"pt\").input_features\n",
415
+ " \n",
416
+ " def __getitem__(self, idx):\n",
417
+ " return {\n",
418
+ " 'image_path': self.image_paths[idx],\n",
419
+ " 'image_tensor': self.get_image_tensor(self.image_paths[idx]),\n",
420
+ " 'audio_path': self.audio_paths[idx],\n",
421
+ " 'audio_tensor': self.get_audio_tensor(self.audio_paths[idx])\n",
422
+ " }\n",
423
+ " \n",
424
+ " \n",
425
+ "\n",
426
+ "train_df = pd.read_csv(\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Img_Audio_Alignment/available_img_audios_TRAIN3.csv\")\n",
427
+ "test_df = pd.read_csv(\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Img_Audio_Alignment/available_img_audios_TEST2.csv\")\n",
428
+ "audio_tensors_savedir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Audio_tensors/'\n",
429
+ "\n",
430
+ "df = pd.concat([train_df, test_df], axis=0)\n",
431
+ "df"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "execution_count": 4,
437
+ "metadata": {},
438
+ "outputs": [],
439
+ "source": [
440
+ "# savedir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Image_Audio_SD21_Whisper_features/'\n",
441
+ "# done = [i.split('.')[:-2] for i in os.listdir(savedir) if i.endswith('.pt')]\n",
442
+ "# len(done)\n",
443
+ "# print(done[:3])\n",
444
+ "\n",
445
+ "# df['done'] = df['image_path'].apply(lambda x: os.path.basename(x).split('.')[:-1] in done)\n",
446
+ "# print(df.done.value_counts())\n",
447
+ "\n",
448
+ "# df = df[df['done'] == False]\n",
449
+ "# df.drop(columns=['done'], inplace=True)\n",
450
+ "# df = df.reset_index(drop=True)\n",
451
+ "# df"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 5,
457
+ "metadata": {},
458
+ "outputs": [
459
+ {
460
+ "name": "stdout",
461
+ "output_type": "stream",
462
+ "text": [
463
+ "Length of Train dataset: 73755\n",
464
+ "Total batches: 2305\n",
465
+ " 73755\n",
466
+ "Total batches: 2305\n",
467
+ "Image batch shape: torch.Size([32, 3, 224, 224])\n",
468
+ "Audio batch shape: torch.Size([32, 1, 80, 3000])\n"
469
+ ]
470
+ }
471
+ ],
472
+ "source": [
473
+ "dataset = VaaniImageAudioDataset(df)\n",
474
+ "\n",
475
+ "# s = 0.009\n",
476
+ "# dataset, _ = torch.utils.data.random_split(dataset, [s, 1-s], torch.manual_seed(42))\n",
477
+ "\n",
478
+ "print(\"Length of Train dataset:\", len(dataset))\n",
479
+ "\n",
480
+ "\n",
481
+ "BATCH_SIZE = int(32)\n",
482
+ "dataloader = torch.utils.data.DataLoader(\n",
483
+ " dataset,\n",
484
+ " batch_size=BATCH_SIZE, \n",
485
+ " shuffle=False, \n",
486
+ " num_workers=48,\n",
487
+ " pin_memory=False,\n",
488
+ " drop_last=False,\n",
489
+ " persistent_workers=True\n",
490
+ ")\n",
491
+ "print('Total batches:', len(dataloader))\n",
492
+ "\n",
493
+ "batch = next(iter(dataloader))\n",
494
+ "image_tensor_batch = batch['image_tensor'].to(device=device)\n",
495
+ "audio_tensor_batch = batch['audio_tensor'].to(device=device)\n",
496
+ "image_paths_batch = batch['image_path']\n",
497
+ "audio_paths_batch = batch['audio_path']\n",
498
+ "print(\"Image batch shape:\", image_tensor_batch.shape)\n",
499
+ "print(\"Audio batch shape:\", audio_tensor_batch.shape)\n",
500
+ "# for batch in tqdm(dataloader):\n",
501
+ "# pass"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "markdown",
506
+ "metadata": {},
507
+ "source": [
508
+ "# Preparing Whisper Audio Encoder"
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": 6,
514
+ "metadata": {},
515
+ "outputs": [],
516
+ "source": [
517
+ "class WhisperEncoder2(nn.Module):\n",
518
+ " def __init__(\n",
519
+ " self, \n",
520
+ " encoder, \n",
521
+ " input_dim=1280, \n",
522
+ " output_dim=1024, \n",
523
+ " n_heads=8, \n",
524
+ " num_layers=2, \n",
525
+ " dropout=0.1\n",
526
+ " ):\n",
527
+ " super().__init__()\n",
528
+ "\n",
529
+ " self.encoder = encoder.eval()\n",
530
+ " for param in self.encoder.parameters():\n",
531
+ " param.requires_grad = False\n",
532
+ "\n",
533
+ " # Learnable query token to act like CLS\n",
534
+ " self.query = nn.Parameter(torch.randn(1, 1, input_dim)) # [1, 1, D]\n",
535
+ "\n",
536
+ " encoder_layer = nn.TransformerEncoderLayer(\n",
537
+ " d_model=input_dim, \n",
538
+ " nhead=n_heads, \n",
539
+ " dim_feedforward=input_dim * 4, \n",
540
+ " dropout=dropout, \n",
541
+ " batch_first=True\n",
542
+ " )\n",
543
+ " self.transformer = nn.TransformerEncoder(\n",
544
+ " encoder_layer, \n",
545
+ " num_layers=num_layers\n",
546
+ " )\n",
547
+ "\n",
548
+ " self.proj = nn.Linear(input_dim, output_dim)\n",
549
+ "\n",
550
+ " def forward(self, input_features):\n",
551
+ " with torch.no_grad():\n",
552
+ " encoder_outputs = self.encoder(input_features=input_features)\n",
553
+ " hidden_states = encoder_outputs.last_hidden_state # [B, T, D]\n",
554
+ "\n",
555
+ " B = hidden_states.size(0)\n",
556
+ "\n",
557
+ " # Expand learnable query to match batch size\n",
558
+ " query = self.query.expand(B, -1, -1) # [B, 1, D]\n",
559
+ " x = torch.cat([query, hidden_states], dim=1) # [B, 1+T, D]\n",
560
+ "\n",
561
+ " x = self.transformer(x) # [B, 1+T, D]\n",
562
+ " pooled = x[:, 0:1, :] # Take output of query token only\n",
563
+ "\n",
564
+ " return self.proj(pooled) # [B, 1, output_dim]\n",
565
+ "\n",
566
+ "\n",
567
+ "whisper_model = WhisperModel.from_pretrained(\n",
568
+ " pretrained_model_name_or_path=\"openai/whisper-large-v2\",\n",
569
+ " cache_dir='/scratch/IITB/ai-at-ieor/23m1521/hf_cache/'\n",
570
+ " )\n",
571
+ "\n",
572
+ "# audio_encoder = WhisperEncoder2(encoder=whisper_model.encoder).to(device)\n",
573
+ "# whisper_encoder = freeze_model(whisper_model.encoder).eval()\n",
574
+ "\n",
575
+ "whisper_encoder = torch.compile(\n",
576
+ " freeze_model(whisper_model.encoder), \n",
577
+ " backend=\"aot_eager\"\n",
578
+ " ).eval().to(device)"
579
+ ]
580
+ },
581
+ {
582
+ "cell_type": "markdown",
583
+ "metadata": {},
584
+ "source": [
585
+ "## Train Image Features"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": 7,
591
+ "metadata": {},
592
+ "outputs": [
593
+ {
594
+ "name": "stdout",
595
+ "output_type": "stream",
596
+ "text": [
597
+ "cuda\n",
598
+ "\n"
599
+ ]
600
+ }
601
+ ],
602
+ "source": [
603
+ "print(device)"
604
+ ]
605
+ },
606
+ {
607
+ "cell_type": "code",
608
+ "execution_count": null,
609
+ "metadata": {},
610
+ "outputs": [
611
+ {
612
+ "data": {
613
+ "application/vnd.jupyter.widget-view+json": {
614
+ "model_id": "76117f359bd14657904178fb83c3966d",
615
+ "version_major": 2,
616
+ "version_minor": 0
617
+ },
618
+ "text/plain": [
619
+ "[Extracting Features]: 0%| | 0/2305 [00:00<?, ?it/s]"
620
+ ]
621
+ },
622
+ "metadata": {},
623
+ "output_type": "display_data"
624
+ }
625
+ ],
626
+ "source": [
627
+ "import gc\n",
628
+ "def force_gc():\n",
629
+ " gc.collect()\n",
630
+ " torch.cuda.empty_cache()\n",
631
+ " # torch.cuda.ipc_collect() # Optional: cleans up interprocess caches\n",
632
+ "\n",
633
+ "\n",
634
+ "savedir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Image_Audio_SD21_Whisper_features/'\n",
635
+ "os.makedirs(savedir, exist_ok=True)\n",
636
+ "\n",
637
+ "train_loop = tqdm(dataloader, desc=f\"[Extracting Features]\", colour='blue', dynamic_ncols=True)\n",
638
+ "for i, batch in enumerate(train_loop):\n",
639
+ " # if i == 1:break\n",
640
+ " \n",
641
+ " with torch.cuda.amp.autocast():\n",
642
+ "\n",
643
+ " image_paths_batch = batch['image_path']\n",
644
+ " image_tensor_batch = batch['image_tensor'].to(device=device)\n",
645
+ " \n",
646
+ " audio_tensor_batch = batch['audio_tensor'].squeeze(1).to(device=device)\n",
647
+ " audio_paths_batch = batch['audio_path']\n",
648
+ " with torch.no_grad():\n",
649
+ " encoder_outputs = whisper_encoder(input_features=audio_tensor_batch)\n",
650
+ " hidden_states = encoder_outputs.last_hidden_state\n",
651
+ " \n",
652
+ "\n",
653
+ " for i in range(len(image_paths_batch)):\n",
654
+ " torch.save({\n",
655
+ " 'image_path': image_paths_batch[i],\n",
656
+ " 'image_features': image_tensor_batch[i].detach().cpu(),\n",
657
+ " 'audio_path': audio_paths_batch[i],\n",
658
+ " 'audio_features': hidden_states[i].detach().cpu(),\n",
659
+ " }, os.path.join(savedir, f\"{os.path.basename(image_paths_batch[i])}.pt\")\n",
660
+ " )\n",
661
+ " \n",
662
+ " if i % 20 == 0:\n",
663
+ " del image_tensor_batch, audio_tensor_batch, hidden_states\n",
664
+ " force_gc"
665
+ ]
666
+ },
667
+ {
668
+ "cell_type": "code",
669
+ "execution_count": 9,
670
+ "metadata": {},
671
+ "outputs": [],
672
+ "source": [
673
+ "# !rm -rf '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Image_Audio_SD21_Whisper_features/'"
674
+ ]
675
+ },
676
+ {
677
+ "cell_type": "code",
678
+ "execution_count": 9,
679
+ "metadata": {},
680
+ "outputs": [
681
+ {
682
+ "data": {
683
+ "text/plain": [
684
+ "73755"
685
+ ]
686
+ },
687
+ "execution_count": 9,
688
+ "metadata": {},
689
+ "output_type": "execute_result"
690
+ }
691
+ ],
692
+ "source": [
693
+ "import os\n",
694
+ "savedir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Image_Audio_SD21_Whisper_features/'\n",
695
+ "\n",
696
+ "len(os.listdir(savedir))"
697
+ ]
698
+ },
699
+ {
700
+ "cell_type": "code",
701
+ "execution_count": 10,
702
+ "metadata": {},
703
+ "outputs": [
704
+ {
705
+ "name": "stdout",
706
+ "output_type": "stream",
707
+ "text": [
708
+ "549G\t/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Image_Audio_SD21_Whisper_features/\n"
709
+ ]
710
+ }
711
+ ],
712
+ "source": [
713
+ "!du -sh /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Image_Audio_SD21_Whisper_features/"
714
+ ]
715
+ },
716
+ {
717
+ "cell_type": "markdown",
718
+ "metadata": {},
719
+ "source": [
720
+ "## New Dataset Class"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "code",
725
+ "execution_count": 23,
726
+ "metadata": {},
727
+ "outputs": [
728
+ {
729
+ "data": {
730
+ "text/plain": [
731
+ "{'image_path': '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/IISc_VaaniProject_Varanasi-SPECIFIC_01655.jpg',\n",
732
+ " 'image_features': tensor([[[ 1.1123, 1.1123, 1.1123, ..., 0.9526, 0.9526, 0.9380],\n",
733
+ " [ 1.1270, 1.1123, 1.1123, ..., 0.9526, 0.9526, 0.9526],\n",
734
+ " [ 1.1270, 1.1270, 1.1123, ..., 0.9526, 0.9526, 0.9526],\n",
735
+ " ...,\n",
736
+ " [-0.4346, -0.4199, -0.4346, ..., -1.1943, -1.1650, -1.1797],\n",
737
+ " [-0.4783, -0.4636, -0.4490, ..., -1.2383, -1.1650, -1.1797],\n",
738
+ " [-0.4346, -0.4490, -0.4927, ..., -1.2529, -1.1797, -1.1943]],\n",
739
+ " \n",
740
+ " [[ 1.4307, 1.4307, 1.4307, ..., 1.3545, 1.3545, 1.3389],\n",
741
+ " [ 1.4453, 1.4307, 1.4307, ..., 1.3545, 1.3545, 1.3545],\n",
742
+ " [ 1.4453, 1.4453, 1.4307, ..., 1.3545, 1.3545, 1.3545],\n",
743
+ " ...,\n",
744
+ " [-0.5513, -0.5366, -0.5366, ..., -1.2715, -1.2422, -1.2568],\n",
745
+ " [-0.5815, -0.5664, -0.5513, ..., -1.3164, -1.2422, -1.2568],\n",
746
+ " [-0.5366, -0.5513, -0.5962, ..., -1.3320, -1.2568, -1.2715]],\n",
747
+ " \n",
748
+ " [[ 1.6621, 1.6621, 1.6621, ..., 1.6621, 1.6621, 1.6475],\n",
749
+ " [ 1.6758, 1.6621, 1.6621, ..., 1.6621, 1.6621, 1.6621],\n",
750
+ " [ 1.6758, 1.6758, 1.6621, ..., 1.6621, 1.6621, 1.6621],\n",
751
+ " ...,\n",
752
+ " [-0.5132, -0.4990, -0.4990, ..., -1.1377, -1.0811, -1.1240],\n",
753
+ " [-0.5415, -0.5273, -0.5132, ..., -1.1660, -1.0811, -1.1377],\n",
754
+ " [-0.4990, -0.5132, -0.5557, ..., -1.1816, -1.0957, -1.1523]]],\n",
755
+ " dtype=torch.float16),\n",
756
+ " 'audio_path': '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/UttarPradesh_Varanasi/IISc_VaaniProject_M_UP_Varanasi_18587414_0917000000_UPVNTA_123286_6733_8870.wav',\n",
757
+ " 'audio_features': tensor([[-1.1608e+00, -9.0333e-02, 3.3006e-02, ..., -4.3307e+00,\n",
758
+ " -1.4483e-01, -1.0611e+00],\n",
759
+ " [ 2.5317e-01, -2.7337e-01, -1.8108e-01, ..., -3.6791e+00,\n",
760
+ " 5.2682e-01, -6.8573e-01],\n",
761
+ " [ 4.7004e-01, -8.1346e-01, 1.0142e+00, ..., -2.2765e+00,\n",
762
+ " 1.2923e+00, -8.2782e-01],\n",
763
+ " ...,\n",
764
+ " [-6.1619e-03, -7.1685e-03, -1.0914e-02, ..., 6.0164e-03,\n",
765
+ " -4.9124e-03, -1.9412e-03],\n",
766
+ " [-2.5727e-03, -2.7489e-03, -9.8100e-03, ..., -5.9428e-03,\n",
767
+ " -1.4006e-03, 4.9841e-04],\n",
768
+ " [-2.5339e-03, -1.1025e-02, -1.6143e-02, ..., -8.3381e-03,\n",
769
+ " 5.2792e-04, 1.2501e-02]])}"
770
+ ]
771
+ },
772
+ "execution_count": 23,
773
+ "metadata": {},
774
+ "output_type": "execute_result"
775
+ }
776
+ ],
777
+ "source": [
778
+ "idx = 1\n",
779
+ "torch.load(features_paths[idx])"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "execution_count": null,
785
+ "metadata": {},
786
+ "outputs": [],
787
+ "source": [
788
+ "# ==================================================================\n",
789
+ "# I M A G E - A U D I O - D A T A S E T\n",
790
+ "# ==================================================================\n",
791
+ "def denormalize_image(img_tensor):\n",
792
+ " mean = np.array([0.48145466, 0.4578275, 0.40821073]).reshape(3, 1, 1)\n",
793
+ " std = np.array([0.26862954, 0.26130258, 0.27577711]).reshape(3, 1, 1)\n",
794
+ " \n",
795
+ " img = img_tensor * std + mean # de-normalize\n",
796
+ " img = np.clip(img, 0, 1) # clip to [0, 1] for display\n",
797
+ " img = np.transpose(img, (1, 2, 0)) # CHW -> HWC\n",
798
+ " return img\n",
799
+ "\n",
800
+ "class VaaniImageAudioDataset(torch.utils.data.Dataset):\n",
801
+ " def __init__(self, features_paths):\n",
802
+ " self.features_paths = features_paths\n",
803
+ " self.image_transforms = v2.Compose([\n",
804
+ " v2.ToImage(),\n",
805
+ " v2.Resize((224, 224), antialias=True),\n",
806
+ " v2.RandomCrop(size=(224, 224)),\n",
807
+ " v2.ToDtype(torch.float16, scale=True),\n",
808
+ " v2.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], \n",
809
+ " std=[0.26862954, 0.26130258, 0.27577711])\n",
810
+ " ])\n",
811
+ " \n",
812
+ " self.feature_extractor = WhisperFeatureExtractor.from_pretrained(\"openai/whisper-large-v2\")\n",
813
+ " self.sampling_rate = self.feature_extractor.sampling_rate\n",
814
+ "\n",
815
+ " def __len__(self):\n",
816
+ " return len(self.features_paths)\n",
817
+ " \n",
818
+ " def __getitem__(self, idx):\n",
819
+ " return torch.load(self.features_paths[idx])\n",
820
+ " \n",
821
+ " \n",
822
+ "\n",
823
+ "train_df = pd.read_csv(\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Img_Audio_Alignment/available_img_audios_TRAIN3.csv\")\n",
824
+ "test_df = pd.read_csv(\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Img_Audio_Alignment/available_img_audios_TEST2.csv\")\n",
825
+ "audio_tensors_savedir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Audio_tensors/'\n",
826
+ "features_savedir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Image_Audio_SD21_Whisper_features/'\n",
827
+ "features_paths = [f\"{features_savedir}/{i}\" for i in os.listdir(features_savedir)]\n",
828
+ "\n",
829
+ "df = pd.concat([train_df, test_df], axis=0)\n",
830
+ "dataset = VaaniImageAudioDataset(features_paths)\n",
831
+ "\n",
832
+ "s = 0.005\n",
833
+ "dataset, _ = torch.utils.data.random_split(dataset, [s, 1-s], torch.manual_seed(42))\n",
834
+ "\n",
835
+ "print(\"Length of Train dataset:\", len(dataset))\n",
836
+ "\n",
837
+ "\n",
838
+ "BATCH_SIZE = int(64)\n",
839
+ "dataloader = torch.utils.data.DataLoader(\n",
840
+ " dataset,\n",
841
+ " batch_size=BATCH_SIZE, \n",
842
+ " shuffle=True, \n",
843
+ " num_workers=48,\n",
844
+ " pin_memory=True,\n",
845
+ " drop_last=False,\n",
846
+ " prefetch_factor=5,\n",
847
+ " persistent_workers=True\n",
848
+ ")\n",
849
+ "print('Total batches:', len(dataloader))\n",
850
+ "\n",
851
+ "batch = next(iter(dataloader))\n",
852
+ "image_tensor_batch = batch['image_tensor'].to(device=device)\n",
853
+ "audio_tensor_batch = batch['audio_tensor'].to(device=device)\n",
854
+ "image_paths_batch = batch['image_path']\n",
855
+ "audio_paths_batch = batch['audio_path']\n",
856
+ "print(\"Image batch shape:\", image_tensor_batch.shape)\n",
857
+ "print(\"Audio batch shape:\", audio_tensor_batch.shape)\n",
858
+ "# for batch in tqdm(dataloader):\n",
859
+ "# pass"
860
+ ]
861
+ },
862
+ {
863
+ "cell_type": "code",
864
+ "execution_count": 7,
865
+ "metadata": {},
866
+ "outputs": [
867
+ {
868
+ "name": "stdout",
869
+ "output_type": "stream",
870
+ "text": [
871
+ "Train Dataset: 26810\n",
872
+ "Test Dataset: 11490\n"
873
+ ]
874
+ },
875
+ {
876
+ "data": {
877
+ "text/plain": [
878
+ "{'image_path': '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/IISc_VaaniProject_Lucknow-SPECIFIC_00826.jpg',\n",
879
+ " 'image_feature': tensor([-0.1034, 0.4547, -0.3613, ..., -0.4897, -0.0025, 0.6462]),\n",
880
+ " 'audio_path': '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/UttarPradesh_Lucknow/IISc_VaaniProject_K_UttarPradesh_Lucknow_Lucknow844425030382473_010_Lucknow-SPECIFIC_00826_4706_6462.wav',\n",
881
+ " 'audio_tensor': tensor([-0.0131, -0.0133, -0.0105, ..., -0.0070, -0.0086, -0.0096])}"
882
+ ]
883
+ },
884
+ "execution_count": 7,
885
+ "metadata": {},
886
+ "output_type": "execute_result"
887
+ }
888
+ ],
889
+ "source": [
890
+ "# ==================================================================\n",
891
+ "# I M A G E - A U D I O - D A T A S E T\n",
892
+ "# ==================================================================\n",
893
+ "class VaaniImageAudioDataset(torch.utils.data.Dataset):\n",
894
+ " def __init__(self, df, image_features_savedir, audio_tensors_savedir):\n",
895
+ " self.image_paths = df.image_path.tolist()\n",
896
+ " self.audio_paths = df.audio_path.tolist()\n",
897
+ " self.image_features_savedir = image_features_savedir\n",
898
+ " self.audio_tensors_savedir = audio_tensors_savedir\n",
899
+ "\n",
900
+ " def __len__(self):\n",
901
+ " return len(self.audio_paths)\n",
902
+ "\n",
903
+ " def __getitem__(self, idx):\n",
904
+ " return {\n",
905
+ " 'image_path': self.image_paths[idx],\n",
906
+ " 'image_feature': torch.load(os.path.join(\n",
907
+ " self.image_features_savedir, \n",
908
+ " f\"{os.path.basename(self.image_paths[idx])}.pt\"))['image_features'],\n",
909
+ " 'audio_path': self.audio_paths[idx],\n",
910
+ " 'audio_tensor': torch.load(os.path.join(\n",
911
+ " audio_tensors_savedir, \n",
912
+ " f\"{os.path.basename(self.audio_paths[idx])}.pt\"))['audio_tensor']\n",
913
+ " }\n",
914
+ " \n",
915
+ "\n",
916
+ "train_df = pd.read_csv(\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Img_Audio_Alignment/available_img_audios_TRAIN2.csv\")\n",
917
+ "test_df = pd.read_csv(\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Img_Audio_Alignment/available_img_audios_TEST2.csv\")\n",
918
+ "image_features_savedir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Image_features/'\n",
919
+ "audio_tensors_savedir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Audio_tensors/'\n",
920
+ "train_dataset = VaaniImageAudioDataset(train_df, image_features_savedir, audio_tensors_savedir)\n",
921
+ "test_dataset = VaaniImageAudioDataset(test_df, image_features_savedir, audio_tensors_savedir)\n",
922
+ "\n",
923
+ "print('Train Dataset:', len(train_dataset))\n",
924
+ "print('Test Dataset:', len(test_dataset))\n",
925
+ "train_dataset[0]"
926
+ ]
927
+ },
928
+ {
929
+ "cell_type": "code",
930
+ "execution_count": 9,
931
+ "metadata": {},
932
+ "outputs": [
933
+ {
934
+ "name": "stdout",
935
+ "output_type": "stream",
936
+ "text": [
937
+ "Image batch shape: torch.Size([64, 1024])\n",
938
+ "Audio batch shape: torch.Size([64, 308700])\n"
939
+ ]
940
+ }
941
+ ],
942
+ "source": [
943
+ "BATCH_SIZE = int(64)\n",
944
+ "train_dataloader = torch.utils.data.DataLoader(\n",
945
+ " train_dataset,\n",
946
+ " batch_size=BATCH_SIZE, \n",
947
+ " shuffle=True, \n",
948
+ " num_workers=48,\n",
949
+ " pin_memory=True,\n",
950
+ " drop_last=False,\n",
951
+ " persistent_workers=True\n",
952
+ ")\n",
953
+ "\n",
954
+ "test_dataloader = torch.utils.data.DataLoader(\n",
955
+ " test_dataset,\n",
956
+ " batch_size=BATCH_SIZE, \n",
957
+ " shuffle=False, \n",
958
+ " num_workers=48,\n",
959
+ " pin_memory=True,\n",
960
+ " drop_last=False,\n",
961
+ " persistent_workers=True\n",
962
+ ")\n",
963
+ "\n",
964
+ "batch = next(iter(train_dataloader))\n",
965
+ "image_features_batch = batch['image_feature'].to(device=device)\n",
966
+ "audio_tensor_batch = batch['audio_tensor'].to(device=device)\n",
967
+ "image_paths_batch = batch['image_path']\n",
968
+ "audio_paths_batch = batch['audio_path']\n",
969
+ "print(\"Image batch shape:\", image_features_batch.shape) # [BATCH_SIZE, 3, 224, 224]\n",
970
+ "print(\"Audio batch shape:\", audio_tensor_batch.shape) # [BATCH_SIZE, 1, 44100]\n"
971
+ ]
972
+ }
973
+ ],
974
+ "metadata": {
975
+ "kernelspec": {
976
+ "display_name": "clap",
977
+ "language": "python",
978
+ "name": "python3"
979
+ },
980
+ "language_info": {
981
+ "codemirror_mode": {
982
+ "name": "ipython",
983
+ "version": 3
984
+ },
985
+ "file_extension": ".py",
986
+ "mimetype": "text/x-python",
987
+ "name": "python",
988
+ "nbconvert_exporter": "python",
989
+ "pygments_lexer": "ipython3",
990
+ "version": "3.11.11"
991
+ }
992
+ },
993
+ "nbformat": 4,
994
+ "nbformat_minor": 2
995
+ }
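The notebook above writes one .pt file per image-audio pair (about 549G in total, per the du -sh cell). A minimal sketch of reading one record back, assuming the extraction finished; the filename is taken from the example output above, and the shape comments reflect the 224x224 image transform and the whisper-large-v2 encoder (1500 frames, 1280 dims):

# Inspect one saved feature record (example filename taken from the notebook output).
import torch

features_dir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Hindi_Image_Audio_SD21_Whisper_features"
feature_file = f"{features_dir}/IISc_VaaniProject_Varanasi-SPECIFIC_01655.jpg.pt"

record = torch.load(feature_file, map_location="cpu")
print(record["image_path"])             # original image location
print(record["image_features"].shape)   # torch.Size([3, 224, 224]), float16 image tensor
print(record["audio_path"])             # paired Hindi audio clip
print(record["audio_features"].shape)   # torch.Size([1500, 1280]) Whisper encoder states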
scratch/IITB/ai-at-ieor/23m1521/SDFT/Vaani/SDFT/SD21_Whisper/config-SD21_Whisper.yaml ADDED
@@ -0,0 +1 @@
+ learning_rate: 1e-7
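Note that with PyYAML's default resolver, 1e-7 (no decimal point) is loaded as the string '1e-7', not a float; that is why the training script wraps every use in float(Config.learning_rate). A quick check:

import yaml

print(repr(yaml.safe_load("learning_rate: 1e-7")["learning_rate"]))    # '1e-7'  (str)
print(repr(yaml.safe_load("learning_rate: 1.0e-7")["learning_rate"]))  # 1e-07   (float)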
scratch/IITB/ai-at-ieor/23m1521/SDFT/Vaani/_1.1_Audio-Hindi-Download.ipynb CHANGED
@@ -1045,7 +1045,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "aku",
    "language": "python",
    "name": "python3"
   },