In [11]:
import os
import json
from pyspark.sql import SparkSession
import pandas as pd
import polars as pl
from tqdm.auto import tqdm, trange
from concurrent.futures import ThreadPoolExecutor, as_completed
import matplotlib.pyplot as plt

import torch
import torchvision as tv
from torchvision.transforms import v2

# Root folders for the Vaani dataset: image data is addressed under SCRATCH,
# the dataset JSON under DATADIR.
SCRATCH = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani"
DATADIR = r"/home/IITB/ai-at-ieor/23m1521/datasets/Vaani"
# Paths derived from the two roots above.
JSON_PATH = os.path.join(DATADIR, "Vaani_IIsc_Artpark_Full_Data.json")
IMAGES_PATH = os.path.join(SCRATCH, "Images")  # walked for image files later in the notebook
IMAGES_PARQUETS = os.path.join(SCRATCH, "images_parquets")
# URL-list file names and the base URL. They are not used in the cells visible
# here (presumably consumed by a download step -- TODO confirm).
AUDIO_URLS = "audio_urls.txt"
IMAGES_URLS = "images_urls.txt"
IMAGE_ROOT_URL = 'https://vaani.iisc.ac.in/'
# Parquet table of image/audio metadata; loaded into `df` later in the notebook.
METADATA_PATH = r"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Images-Audio-MetaData.parquet"

In [7]:
def walkDIR(folder_path, include=None):
    """Recursively list the files below ``folder_path``.

    Args:
        folder_path: Directory to traverse with ``os.walk``.
        include: Optional iterable of file-name suffixes (e.g. ``['.png']``).
            A file is kept when its name ends with any of them; the
            comparison is case-sensitive. ``None`` keeps every file.

    Returns:
        A list of paths (``folder_path`` joined with the relative location
        of each kept file), in ``os.walk`` order.

    Side effects:
        Prints the number of files found.
    """
    def _is_wanted(name):
        # No filter supplied: accept everything.
        if include is None:
            return True
        return any(name.endswith(suffix) for suffix in include)

    matches = [
        os.path.join(dirpath, name)
        for dirpath, _, names in os.walk(folder_path)
        for name in names
        if _is_wanted(name)
    ]
    print("Files found:", len(matches))
    return matches

# Collect every .png/.jpeg/.jpg path under IMAGES_PATH (suffix match is case-sensitive).
files = walkDIR(IMAGES_PATH, include=['.png', '.jpeg', '.jpg'])

Files found: 128807


In [8]:
# Peek at the first ten discovered image paths.
files[:10]

['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_TehriGarhwal-SPECIFIC_00863.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Budaun-SPECIFIC_00129.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Jashpur-SPECIFIC_00102.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Saharsa-SPECIFIC_00905.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Purulia-SPECIFIC_00971.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Samastipur-SPECIFIC_00274.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_PaschimMedinipur-SPECIFIC_00501.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Sahebganj-SPECIFIC_00315.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan

In [None]:
class MnistDataset(torch.utils.data.Dataset):
    """Image-classification dataset backed by a table of image paths and labels.

    Every row of the table must provide an ``image_path`` entry (the file to
    decode) and a ``label`` entry (a value convertible to ``int``).

    NOTE(review): despite the name, nothing in this class is MNIST-specific.

    Args:
        data: Either the path of a CSV file (read with ``pd.read_csv``) or an
            already-loaded ``pd.DataFrame``.
        im_size: Size handed to ``torchvision.transforms.v2.Resize``. The
            resize transform is built once at construction time, so
            reassigning ``self.im_size`` afterwards does not change it.
            NOTE(review): per the torchvision docs an ``int`` only fixes the
            shorter edge, so images with different aspect ratios come out with
            different shapes; pass ``(height, width)`` if samples must be
            stackable into a batch -- confirm against the intended DataLoader.

    Raises:
        ValueError: If ``data`` is neither a string nor a DataFrame.
    """

    def __init__(
        self,
        data,
        im_size
    ):
        # Validate/load the table first so bad input fails before anything
        # else is constructed.
        if isinstance(data, str):
            self.data = pd.read_csv(data)
        elif isinstance(data, pd.DataFrame):
            self.data = data
        else:
            raise ValueError("The `data` argument must be a string (CSV file path) or a Pandas DataFrame.")

        self.im_size = im_size

        # Build the transforms once instead of on every __getitem__ call.
        self._resize = v2.Resize(im_size)
        self._to_float = v2.ToDtype(torch.float32, scale=True)

    def __len__(self):
        """Return the number of rows (samples) in the table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Positional row index into the table.

        Returns:
            ``(image, label)`` where ``image`` is the decoded RGB image after
            resizing, conversion to ``float32`` with value scaling, and the
            affine map ``2 * x - 1``; ``label`` is an ``int``.
        """
        row = self.data.iloc[idx]
        image_path = row['image_path']
        label = int(row['label'])

        image = tv.io.decode_image(image_path, mode='RGB')
        image = self._to_float(self._resize(image))
        # Shift the scaled values with 2*x - 1 (maps [0, 1] onto [-1, 1]).
        image = 2 * image - 1

        return image, label

In [3]:
# Load the image/audio metadata table from the parquet file at METADATA_PATH.
df = pd.read_parquet(METADATA_PATH)

In [4]:
# Display the metadata frame (pandas elides the middle rows of a large frame).
df

Unnamed: 0,image_name,state,district,gender,audio_language,audio_name
0,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
1,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
2,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
3,Images/IISc_VaaniProject_GENERIC_0073.jpg,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
4,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
...,...,...,...,...,...,...
9584927,Images/IISc_VaaniProject_GENERIC_0554.jpg,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584928,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584929,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584930,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...


In [22]:
# Rebuild the metadata-style relative name ('Images/<file name>') for the
# first image found on disk, so it can be matched against df.image_name.
p = os.path.join('Images', os.path.basename(files[0]))
p

'Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg'

In [23]:
# Select every metadata row whose image_name equals that relative name.
df[df.image_name == p]

Unnamed: 0,image_name,state,district,gender,audio_language,audio_name
293605,Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg,Bihar,Saran,male,Hindi,Audios/Saran/IISc_VaaniProject_S_Bihar_Saran_1...
746685,Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg,Bihar,Saran,male,Hindi,Audios/Saran/IISc_VaaniProject_S_Bihar_Saran_6...
746694,Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg,Bihar,Saran,male,Hindi,Audios/Saran/IISc_VaaniProject_S_Bihar_Saran_6...
855359,Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg,Bihar,Saran,female,Hindi,Audios/Saran/IISc_VaaniProject_S_Bihar_Saran_1...
1156285,Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg,Bihar,Saran,female,Hindi,Audios/Saran/IISc_VaaniProject_S_Bihar_Saran_6...
1156411,Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg,Bihar,Saran,female,Hindi,Audios/Saran/IISc_VaaniProject_S_Bihar_Saran_6...
1157007,Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg,Bihar,Saran,female,Hindi,Audios/Saran/IISc_VaaniProject_S_Bihar_Saran_1...
1157034,Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg,Bihar,Saran,female,Hindi,Audios/Saran/IISc_VaaniProject_S_Bihar_Saran_1...
1157692,Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg,Bihar,Saran,male,Hindi,Audios/Saran/IISc_VaaniProject_S_Bihar_Saran_1...
1157740,Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg,Bihar,Saran,male,Hindi,Audios/Saran/IISc_VaaniProject_S_Bihar_Saran_1...
