# Vaani-Audio2Img-LDM/Vaani/Vaani-data-reading-options-test.py
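# Option 1 (below, kept for reference): lazily scan the remote parquet shards with
# polars over hf:// URLs, keep only the path field of the `audio` struct, drop the
# audio bytes, and write one metadata parquet per district in parallel via joblib.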
# import os
# from huggingface_hub import login
# import polars as pl
# from tqdm import tqdm
# from datasets import get_dataset_config_names
# import pandas as pd
# from joblib import Parallel, delayed
# # Authenticate HuggingFace
# HF_TOKEN = open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r").read().strip()
# os.environ["HF_TOKEN"] = HF_TOKEN
# login(token=HF_TOKEN)
# # Get dataset configs
# configs = get_dataset_config_names("ARTPARK-IISc/Vaani")
# print(configs, "\n\n")
# # Prepare URLs and metadata
# urls_dict = {'state': [], 'district': [], 'url': []}
# for i in configs:
#     state = i.split('_')[0]
#     district = i.split('_')[-1]
#     urls_dict['state'].append(state)
#     urls_dict['district'].append(district)
#     urls_dict['url'].append(f"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet")
# urls_df = pd.DataFrame(urls_dict)
# urls_df = urls_df.iloc[11:,:]
# # Output directory
# savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData"
# os.makedirs(savedir, exist_ok=True)
# # Function to process each config
# def make_filter_df(row):
#     state = row['state']
#     district = row['district']
#     url = row['url']
#     try:
#         df = pl.scan_parquet(url).with_columns(
#             pl.col('audio').struct.field('path').alias('audio_path')
#         ).drop('audio').collect()
#         df.write_parquet(f"{savedir}/{state}_{district}_meta.parquet", compression="gzip")
#         print(f"✅ Saved {state}_{district}_meta.parquet")
#     except Exception as e:
#         print(f"❌ Error processing {state}_{district}: {e}")
# # Parallel processing using joblib
# Parallel(n_jobs=20, backend="loky")(
#     delayed(make_filter_df)(row) for _, row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70)
# )
##################################################################################################################################
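# Option 2 (below, kept for reference): same polars metadata extraction, but run
# sequentially over a hard-coded config list, using an eager read variant and a
# `done` list built from already-written *_meta.parquet files so finished
# districts are skipped on re-runs.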
# import os
# from huggingface_hub import login
# import polars as pl
# from tqdm import tqdm, trange
# from datasets import get_dataset_config_names
# import pandas as pd
# # HF_TOKEN = open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r").read().strip()
# # os.environ["HF_TOKEN"] = HF_TOKEN
# # login(token=HF_TOKEN)
# # configs = get_dataset_config_names("ARTPARK-IISc/Vaani")
# configs = ['AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', 'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare', 'Assam_KamrupMetropolitan', 'Assam_Sonitpur', 'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan', 'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar', 'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna', 'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan', 'Chandigarh_Chandigarh', 'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur', 'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon', 'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma', 'Delhi_NewDelhi', 'Goa_NorthSouthGoa', 'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu', 'Jharkhand_Ranchi', 'Jharkhand_Sahebganj', 'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur', 'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga', 'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni', 'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Nagpur', 'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur', 'Meghalaya_WestGaroHills', 'Nagaland_Dimapur', 'Nagaland_Kohima', 'Odisha_Khordha', 'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris', 'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda', 'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura', 'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah', 'UttarPradesh_Ghazipur', 'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur', 'UttarPradesh_Jalaun', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur', 'UttarPradesh_Lucknow', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'UttarPradesh_Varanasi', 'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi', 'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur', 'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata', 'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia', 'images']
# print(configs, "\n\n")
# urls_dict = {'state': [], 'district': [], 'url': [], 'config_name':[]}
# for i in configs:
#     state = i.split('_')[0]
#     district = i.split('_')[-1]
#     urls_dict['state'].append(state)
#     urls_dict['district'].append(district)
#     urls_dict['url'].append(f"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet")
#     urls_dict['config_name'].append(i)
# urls_df = pd.DataFrame(urls_dict)
# # Unused lazy variant:
# # def make_filter_df(state, district, url, savedir):
# #     df = pl.scan_parquet(url).with_columns(
# #         pl.col('audio').struct.field('path').alias('audio_path')
# #     ).drop('audio').collect(
# #     ).write_parquet(
# #         f"{savedir}/{state}_{district}_meta.parquet",
# #         compression="gzip"
# #     )
# #     print(f"✅ Saved {state}_{district}_meta.parquet")
# def make_filter_df_eager(state, district, url, savedir):
#     df = pl.read_parquet(url).with_columns(
#         pl.col('audio').struct.field('path').alias('audio_path')
#     ).drop('audio')
#     output_path = f"{savedir}/{state}_{district}_meta.parquet"
#     df.write_parquet(output_path, compression="gzip")
#     print(f"✅ Saved {state}_{district}_meta.parquet")
# savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData"
# os.makedirs(savedir, exist_ok=True)
# done = ["_".join(i.split(".")[:-1][0].split("_")[:-1]) for i in sorted(os.listdir(savedir))]
# done
# for row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70):
#     state = row[1]['state']
#     district = row[1]['district']
#     url = row[1]['url']
#     config_name = row[1]['config_name']
#     if config_name in done:
#         print("⚠️ Already Done ", state, district, url)
#     else:
#         print("🏃 Running ", state, district, url)
#         make_filter_df_eager(state, district, url, savedir)
#         done.append(config_name)
##################################################################################################################################################################################
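# Option 3 (below, kept for reference): download each config with
# datasets.load_dataset (audio left undecoded via Audio(decode=False)), iterate
# the rows to collect audio paths plus the remaining columns into a polars
# DataFrame, write the per-district metadata parquet, then delete the cache.
# Note that this version passes only the district name to load_dataset; the
# final version below passes the full State_District config name instead.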
# import os
# from huggingface_hub import login
# import polars as pl
# from tqdm import tqdm, trange
# from datasets import get_dataset_config_names, load_dataset, Audio
# import pandas as pd
# import subprocess
# HF_TOKEN = open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r").read().strip()
# configs = ['AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', 'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare', 'Assam_KamrupMetropolitan', 'Assam_Sonitpur', 'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan', 'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar', 'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna', 'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan', 'Chandigarh_Chandigarh', 'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur', 'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon', 'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma', 'Delhi_NewDelhi', 'Goa_NorthSouthGoa', 'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu', 'Jharkhand_Ranchi', 'Jharkhand_Sahebganj', 'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur', 'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga', 'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni', 'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Nagpur', 'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur', 'Meghalaya_WestGaroHills', 'Nagaland_Dimapur', 'Nagaland_Kohima', 'Odisha_Khordha', 'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris', 'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda', 'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura', 'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah', 'UttarPradesh_Ghazipur', 'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur', 'UttarPradesh_Jalaun', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur', 'UttarPradesh_Lucknow', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'UttarPradesh_Varanasi', 'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi', 'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur', 'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata', 'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia', 'images']
# print(configs, "\n\n")
# urls_dict = {'state': [], 'district': [], 'url': [], 'config_name':[]}
# for i in configs:
#     state = i.split('_')[0]
#     district = i.split('_')[-1]
#     urls_dict['state'].append(state)
#     urls_dict['district'].append(district)
#     urls_dict['url'].append(f"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet")
#     urls_dict['config_name'].append(i)
# urls_df = pd.DataFrame(urls_dict)
# def make_filter_df_from_hf(state, district, cache_dir, savedir, hf_token=HF_TOKEN):
#     ds = load_dataset(
#         "ARTPARK-IISc/Vaani",
#         district,
#         split="train",
#         num_proc=20,
#         token=hf_token,
#         cache_dir=cache_dir,
#         streaming=False
#     )
#     ds = ds.cast_column("audio", Audio(decode=False))
#     audio_paths = []
#     columns = {col: [] for col in ds.column_names if col != "audio"}
#     for row in tqdm(ds, desc=f"Processing {state}_{district}"):
#         audio_paths.append(row["audio"]["path"])
#         for col in columns:
#             columns[col].append(row[col])
#     df = pl.DataFrame({
#         "audio_path": audio_paths,
#         **columns
#     })
#     os.makedirs(savedir, exist_ok=True)
#     out_path = os.path.join(savedir, f"{state}_{district}_meta.parquet")
#     df.write_parquet(out_path, compression="gzip")
#     print(f"✅ Saved {state}_{district}_meta.parquet to {out_path}")
# savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData"
# os.makedirs(savedir, exist_ok=True)
# done = ["_".join(i.split(".")[:-1][0].split("_")[:-1]) for i in sorted(os.listdir(savedir))]
# done
# for row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70):
#     state = row[1]['state']
#     district = row[1]['district']
#     url = row[1]['url']
#     config_name = row[1]['config_name']
#     cache_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/{district}'
#     if config_name in done:
#         print("⚠️ Already Done ", state, district, url)
#     else:
#         print("🏃 Running ", state, district, url)
#         make_filter_df_from_hf(state, district, cache_dir, savedir)
#         subprocess.run(["rm", "-rf", cache_dir], check=True)
#         done.append(config_name)
##################################################################################################################################################################################
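# Final approach (active): same load_dataset-based extraction as Option 3, but
# each config is requested by its full State_District name, wrapped in a retry
# loop, the HF cache is wiped after every district, already-written
# *_meta.parquet files are skipped, and the runs are driven through joblib
# (n_jobs=1 for now, i.e. effectively sequential).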
from joblib import Parallel, delayed
import os
from huggingface_hub import login
import polars as pl
from tqdm import tqdm
from datasets import load_dataset, Audio
import pandas as pd
from time import sleep
import multiprocessing
import subprocess
# Increase timeout for HF downloads (huggingface_hub reads this at import time, so
# exporting HF_HUB_DOWNLOAD_TIMEOUT before launching the script is more reliable)
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "120"  # seconds
multiprocessing.set_start_method("fork", force=True)
# Token setup
with open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r") as f:
    HF_TOKEN = f.read().strip()
# Cache and save directories
savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData"
os.makedirs(savedir, exist_ok=True)
# Dataset configs (add all desired configs here)
configs = [
# Andhra Pradesh
'AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna',
'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam',
# Arunachal Pradesh
'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare',
# Assam
'Assam_KamrupMetropolitan', 'Assam_Sonitpur',
# Bihar
'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan',
'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar',
'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna',
'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul',
'Bihar_Vaishali', 'Bihar_WestChamparan',
# Chandigarh
'Chandigarh_Chandigarh',
# Chhattisgarh
'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur',
'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon',
'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma',
# Delhi
'Delhi_NewDelhi',
# Goa
'Goa_NorthSouthGoa',
# Jharkhand
'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu',
'Jharkhand_Ranchi', 'Jharkhand_Sahebganj',
# Karnataka
'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur',
'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga',
'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga',
# Madhya Pradesh
'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni',
# Maharashtra
'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Nagpur',
'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur',
# Meghalaya
'Meghalaya_WestGaroHills',
# Nagaland
'Nagaland_Dimapur', 'Nagaland_Kohima',
# Odisha
'Odisha_Khordha',
# Rajasthan
'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur',
# Tamil Nadu
'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris',
# Telangana
'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda',
# Tripura
'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura',
# Uttar Pradesh
'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah', 'UttarPradesh_Ghazipur',
'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur', 'UttarPradesh_Jalaun',
'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur', 'UttarPradesh_Lucknow',
'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'UttarPradesh_Varanasi',
# Uttarakhand
'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi',
# West Bengal
'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur',
'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata',
'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia'
]
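# The list above could also be regenerated from the Hub; a minimal sketch (not
# executed here), reusing the get_dataset_config_names call from the commented
# attempts above and dropping the non-audio 'images' config:
# from datasets import get_dataset_config_names
# configs = [c for c in get_dataset_config_names("ARTPARK-IISc/Vaani") if c != 'images']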
# Build DataFrame for configs
urls_dict = {'state': [], 'district': [], 'config_name': []}
for config in configs:
    state, district = config.split('_')
    urls_dict['state'].append(state)
    urls_dict['district'].append(district)
    urls_dict['config_name'].append(config)
urls_df = pd.DataFrame(urls_dict)
# Get already processed configs
done = ["_".join(fname.split(".")[0].split("_")[:-1]) for fname in os.listdir(savedir) if fname.endswith(".parquet")]
urls_df = urls_df[~urls_df['config_name'].isin(done)]
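# (e.g. "Bihar_Patna_meta.parquet" maps back to config "Bihar_Patna", so districts
#  that already have a metadata file are skipped on re-runs)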
# Function to process each config
def make_filter_df_from_hf(state, district, config_district, savedir, hf_token=HF_TOKEN, retries=3):
    cache_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/{district}'
    os.makedirs(cache_dir, exist_ok=True)
    print('\n\n', "-"*10, config_district, "-"*10)
    # for attempt in range(retries):
    attempt = 0
    while True:
        try:
            ds = load_dataset(
                "ARTPARK-IISc/Vaani",
                config_district,
                split="train",
                num_proc=48,
                token=hf_token,
                cache_dir=cache_dir,
                streaming=False
            )
            ds = ds.cast_column("audio", Audio(decode=False))
            audio_paths = []
            columns = {col: [] for col in ds.column_names if col != "audio"}
            for row in tqdm(ds, desc=f"Processing {state}_{district}"):
                audio_paths.append(row["audio"]["path"])
                for col in columns:
                    columns[col].append(row[col])
            df = pl.DataFrame({"audio_path": audio_paths, **columns})
            os.makedirs(savedir, exist_ok=True)
            out_path = os.path.join(savedir, f"{state}_{district}_meta.parquet")
            df.write_parquet(out_path, compression="gzip")
            subprocess.run(["rm", "-rf", cache_dir, '/scratch/IITB/ai-at-ieor/23m1521/hf_cache/hub/datasets--ARTPARK-IISc--Vaani/blobs/'], check=True)
            print(f"✅ Saved {state}_{district}_meta.parquet to {out_path}")
            break
        except Exception as e:
            attempt += 1
            print(f"❌ Failed {config_district} on attempt {attempt} with error: {e}")
            sleep(5)
            # if attempt < retries - 1:
            # else:
            #     print(f"🚨 Giving up on {state}_{district} after {retries} attempts")
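# Direct single-config usage would look like this (example values taken from the
# configs list above):
# make_filter_df_from_hf("Bihar", "Patna", "Bihar_Patna", savedir)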
# Parallel execution
Parallel(n_jobs=1, backend="loky")(
    delayed(make_filter_df_from_hf)(
        row['state'], row['district'], row['config_name'], savedir
    )
    for _, row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='green', ncols=70)
)
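# A minimal read-back sketch (not executed here; paths as assumed above): stack the
# per-district metadata files with polars as a quick sanity check of the output.
# import glob
# meta_files = sorted(glob.glob(os.path.join(savedir, "*_meta.parquet")))
# all_meta = pl.concat([pl.read_parquet(p) for p in meta_files], how="diagonal")
# print(all_meta.shape, all_meta.columns)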