# Vaani-Audio2Img-LDM / Vaani / _1.1_Audio-Hindi-Download.py
# alpha31476's picture
# Image Audio Alignment Train OpenClip
# 0733b34 verified
# !pip install huggingface_hub
# !pip install datasets
"""
1. **Andhra Pradesh [6]**
Anantpur, Chittoor, Guntur, Krishna, Srikakulam, Vishakapattanam
2. **Bihar [20]**
Araria, Begusarai, Bhagalpur, Darbhanga, EastChamparan, Gaya, Gopalganj, Jahanabad, Jamui, Kishanganj, Lakhisarai,
Madhepura, Muzaffarpur, Purnia, Saharsa, Samastipur, Saran, Sitamarhi, Supaul, Vaishali
3. **Chhattisgarh [10]**
Balrampur, Bastar, Bilaspur, Jashpur, Kabirdham, Korba, Raigarh, Rajnandgaon, Sarguja, Sukma
4. **Goa [1]**
NorthSouthGoa
5. **Jharkhand [2]**
Jamtara, Sahebganj
6. **Karnataka [10]**
Belgaum, Bellary, Bijapur, Chamarajanagar, DakshinaKannada, Dharwad, Gulbarga, Mysore, Raichur, Shimoga
7. **Maharashtra [7]**
Aurangabad, Chandrapur, Dhule, Nagpur, Pune, Sindhudurg, Solapur
8. **Rajasthan [2]**
Churu, Nagaur
9. **Telangana [2]**
Karimnagar, Nalgonda
10. **Uttar Pradesh [10]**
Budaun, Deoria, Etah, Ghazipur, Gorakhpur, Hamirpur, Jalaun, JyotibaPhuleNagar, Muzaffarnagar, Varanasi
11. **Uttarakhand [2]**
TehriGarhwal, Uttarkashi
12. **West Bengal [8]**
DakshinDinajpur, Jalpaiguri, Jhargram, Kolkata, Malda, North24Parganas, PaschimMedinipur, Purulia
"""
# Districts in Vaani dataset (NEW)
"""
1. AndhraPradesh (6)
Anantpur Chittoor Guntur Krishna Srikakulam Vishakapattanam
2. ArunachalPradesh (2)
Longding PapumPare
3. Assam (2)
KamrupMetropolitan Sonitpur
4. Bihar (24)
Araria Begusarai Bhagalpur Darbhanga EastChamparan Gaya Gopalganj Jahanabad Jamui Kaimur Katihar Kishanganj
Lakhisarai Madhepura Muzaffarpur Patna Purnia Saharsa Samastipur Saran Sitamarhi Supaul Vaishali WestChamparan
5. Chandigarh (1)
Chandigarh
6. Chhattisgarh (10)
Balrampur Bastar Bilaspur Jashpur Kabirdham Korba Raigarh Rajnandgaon Sarguja Sukma
7. Delhi (1)
NewDelhi
8. Goa (1)
NorthSouthGoa
9. Jharkhand (6)
Deoghar Garhwa Jamtara Palamu Ranchi Sahebganj
10. Karnataka (13)
Bangalore Belgaum Bellary Bidar Bijapur Chamrajnagar DakshinKannada Dharwad Gulbarga Koppal Mysore Raichur Shimoga
11. MadhyaPradesh (3)
Bhopal Dhar Katni
12. Maharashtra (7)
Aurangabad Chandrapur Dhule Nagpur Pune Sindhudurga Solapur
13. Meghalaya (1)
WestGaroHills
14. Nagaland (2)
Dimapur Kohima
15. Odisha (1)
Khordha
16. Rajasthan (3)
Churu Jaipur Nagaur
17. TamilNadu (4)
Chennai Kanyakumari Namakkal Nilgiris
18. Telangana (4)
Hyderabad Karimnagar Mahabubabad Nalgonda
19. Tripura (3)
Dhalai Unakoti WestTripura
20. UttarPradesh (13)
Budaun Deoria Etah Ghazipur Gorakhpur Hamirpur Jalaun JyotibaPhuleNagar Lalitpur Lucknow
Muzzaffarnagar Saharanpur Varanasi
21. Uttarakhand (2)
TehriGarhwal Uttarkashi
22. WestBengal (11)
Alipurduar CoochBehar DakshinDinajpur Darjeeling Jalpaiguri Jhargram Kolkata Malda
North24Parganas PaschimMedinipur Purulia
"""
import os
import subprocess
import numpy as np
from scipy.io.wavfile import write # type: ignore
from tqdm import tqdm, trange
from collections import Counter
import datasets
from datasets import Audio
from datasets import load_dataset
from datasets import get_dataset_config_names
def array_to_wav(audio_array: np.ndarray, sample_rate: int, file_name: str):
    """Store ``audio_array`` as a WAV file at ``file_name``.

    The samples are handed to ``scipy.io.wavfile.write`` exactly as received:
    no peak normalisation and no conversion to 16-bit PCM is performed here,
    so the sample format of the file follows the dtype of ``audio_array``.

    Args:
        audio_array: Audio samples to write.
        sample_rate: Sampling rate given to the WAV writer.
        file_name: Destination path of the WAV file.
    """
    write(filename=file_name, rate=sample_rate, data=audio_array)
# Ask the Hub for the configuration names of the Vaani dataset repository and
# echo them (followed by blank lines) so they appear at the top of the log.
configs = get_dataset_config_names(path="ARTPARK-IISc/Vaani")
print(configs, "\n\n", sep=" ")
# exit()
# Hindi_district = [
# 'Delhi_NewDelhi',
# 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Katni',
# 'Chhattisgarh_Bilaspur',
# 'Maharashtra_Nagpur',
# 'UttarPradesh_Varanasi', 'UttarPradesh_Lucknow', 'UttarPradesh_Gorakhpur'
# ]
# Hindi_district = [
# 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'Rajasthan_Churu',
# 'MadhyaPradesh_Dhar',
# 'Maharashtra_Pune', 'Maharashtra_Aurangabad',
# 'Chhattisgarh_Raigarh',
# 'UttarPradesh_Lalitpur', 'UttarPradesh_Jalaun', 'UttarPradesh_Etah', 'UttarPradesh_Deoria', 'UttarPradesh_Ghazipur'
# ]
# Hindi_district = [
# 'Jharkhand_Ranchi'
# ]
# Configuration names ("<State>_<District>") requested in this run, one per
# line and grouped by state.  Names on commented-out lines are not part of the
# list; uncomment a line to include those configurations.
Hindi_district = [
    # 'AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur',
    # 'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam',
    # 'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare',
    # 'Assam_KamrupMetropolitan', 'Assam_Sonitpur',

    # Bihar
    'Bihar_Araria',
    'Bihar_Begusarai',
    'Bihar_Bhagalpur',
    'Bihar_Darbhanga',
    'Bihar_EastChamparan',
    'Bihar_Gaya',
    'Bihar_Gopalganj',
    'Bihar_Jahanabad',
    'Bihar_Jamui',
    'Bihar_Kaimur',
    'Bihar_Katihar',
    'Bihar_Kishanganj',
    'Bihar_Lakhisarai',
    'Bihar_Madhepura',
    'Bihar_Muzaffarpur',
    'Bihar_Patna',
    'Bihar_Purnia',
    'Bihar_Saharsa',
    'Bihar_Samastipur',
    'Bihar_Saran',
    'Bihar_Sitamarhi',
    'Bihar_Supaul',
    'Bihar_Vaishali',
    'Bihar_WestChamparan',

    # Chandigarh
    'Chandigarh_Chandigarh',

    # Chhattisgarh
    'Chhattisgarh_Balrampur',
    'Chhattisgarh_Bastar',
    'Chhattisgarh_Jashpur',
    'Chhattisgarh_Kabirdham',
    'Chhattisgarh_Korba',
    'Chhattisgarh_Rajnandgaon',
    'Chhattisgarh_Sarguja',
    'Chhattisgarh_Sukma',

    # 'Goa_NorthSouthGoa',

    # Jharkhand
    'Jharkhand_Deoghar',
    'Jharkhand_Garhwa',
    'Jharkhand_Jamtara',
    'Jharkhand_Palamu',
    'Jharkhand_Sahebganj',

    # 'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur',
    # 'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga',
    # 'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga',

    # Maharashtra
    'Maharashtra_Chandrapur',
    'Maharashtra_Dhule',
    'Maharashtra_Sindhudurga',
    'Maharashtra_Solapur',

    # 'Meghalaya_WestGaroHills',
    # 'Nagaland_Dimapur', 'Nagaland_Kohima',
    # 'Odisha_Khordha',
    # 'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris',
    # 'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda',
    # 'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura',

    # UttarPradesh
    'UttarPradesh_Budaun',
    'UttarPradesh_Hamirpur',
    'UttarPradesh_JyotibaPhuleNagar',
    'UttarPradesh_Muzzaffarnagar',
    'UttarPradesh_Saharanpur',

    # Uttarakhand
    'Uttarakhand_TehriGarhwal',
    'Uttarakhand_Uttarkashi',

    # WestBengal
    'WestBengal_Alipurduar',
    'WestBengal_CoochBehar',
    'WestBengal_DakshinDinajpur',
    'WestBengal_Darjeeling',
    'WestBengal_Jalpaiguri',
    'WestBengal_Jhargram',
    'WestBengal_Kolkata',
    'WestBengal_Malda',
    'WestBengal_North24Parganas',
    'WestBengal_PaschimMedinipur',
    'WestBengal_Purulia',
]
# Report requested configurations that the dataset's config listing does not
# contain, so a misspelt or renamed district is noticed before the download
# loop starts.  Membership is tested against a set built once.
available_configs = set(configs)
missing = [name for name in Hindi_district if name not in available_configs]
if missing:
    print(f"WARNING: {len(missing)} requested config(s) are not in the dataset's config list: {missing}")

# Every requested district is processed, in the order given in Hindi_district;
# names absent from `configs` are only reported above, they are not dropped.
hindi_dis_ds = Hindi_district
print(len(hindi_dis_ds), hindi_dis_ds, "\n\n")
# Read the Hugging Face access token from the local token file.  The context
# manager closes the file handle as soon as the token has been read.
with open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r") as token_file:
    HF_TOKEN = token_file.read().strip()

# Column-wise accumulator for the metadata of every audio clip saved in this
# run; it is turned into a DataFrame once all districts have been processed.
df_dict = {"audio_path":[], 'referenceImage':[], 'gender':[], 'state':[], 'district':[]}

import os
os.makedirs("/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache", exist_ok=True)
# NOTE(review): `datasets` is imported before this assignment; the library may
# already have resolved its cache location at import time, in which case this
# setting has no effect in the current process — confirm.
os.environ["HF_DATASETS_CACHE"] = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache"

# Load the reference images recorded so far (one name per line, blank lines
# ignored) if the tracking file exists; otherwise start from an empty set.
ref_image_file = "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/used_reference_images_Hindi-copy_and.txt"
referenceImages = set()
if os.path.exists(ref_image_file):
    with open(ref_image_file, "r") as f:
        referenceImages = {line.strip() for line in f if line.strip()}
# Download each requested district, keep its Hindi rows, save the audio of
# rows whose reference image has not been seen before, and record their
# metadata in df_dict.
for district_no, district in enumerate(hindi_dis_ds, start=1):
    print("-"*2, district_no, "-"*10, district, "-"*10)

    # Each district is downloaded into its own cache directory.
    cache_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/{district}'
    os.makedirs(cache_dir, exist_ok=True)

    ds = load_dataset("ARTPARK-IISc/Vaani", district, split="train", num_proc=20,
                      token=HF_TOKEN, cache_dir=cache_dir, streaming=False)

    # Temporarily disable decoding of audio to avoid reading the actual files
    # while filtering on the language column.
    ds = ds.cast_column("audio", datasets.Audio(decode=False))
    Hindi_rows = ds.filter(lambda example: example["language"] == "Hindi")
    # Re-enable decoding so that iterating the rows yields sample arrays.
    Hindi_rows = Hindi_rows.cast_column("audio", datasets.Audio())

    save_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/{district}'
    os.makedirs(save_dir, exist_ok=True)

    # The progress bar is used as a context manager so it is closed even if a
    # row fails; the tracking file is opened in append mode so names recorded
    # earlier are kept.
    with tqdm(total=len(Hindi_rows), dynamic_ncols=True, colour='blue') as pbar:
        with open(ref_image_file, "a") as ref_out:
            # `row` (not `i`) so the district counter of the outer loop is not
            # shadowed by the per-row dictionary.
            for row in Hindi_rows:
                ref_img = row['referenceImage']
                # NOTE(review): saving is assumed to be gated on the reference
                # image being new (one clip per image) — confirm against the
                # intended behaviour.
                if ref_img not in referenceImages:
                    referenceImages.add(ref_img)
                    ref_out.write(ref_img + "\n")

                    audio = row['audio']
                    df_dict['audio_path'].append(audio['path'])
                    df_dict['referenceImage'].append(row['referenceImage'])
                    df_dict['gender'].append(row['gender'])
                    df_dict['state'].append(row['state'])
                    df_dict['district'].append(row['district'])

                    # NOTE(review): assumes audio['path'] is a bare file name;
                    # an absolute path would make os.path.join ignore save_dir
                    # — confirm.
                    array_to_wav(
                        audio['array'],
                        audio['sampling_rate'],
                        os.path.join(save_dir, audio['path'])
                    )

                pbar.set_description(f"{district} referenceImages {len(referenceImages)}")
                pbar.refresh()
                pbar.update(1)

    # Remove this district's download cache and the shared Hugging Face cache
    # directories before the next district (presumably to free disk space).
    # `rm -rf` succeeds on paths that do not exist; check=True raises if the
    # command itself fails.
    for stale_path in (
        cache_dir,
        "/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/hub/datasets--ARTPARK-IISc--Vaani",
        "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache",
        "/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/datasets",
        "/scratch/IITB/ai-at-ieor/23m1521/hf_cache/hub/datasets--ARTPARK-IISc--Vaani",
    ):
        subprocess.run(["rm", "-rf", stale_path], check=True)

    # Recreate the shared cache directory that was just deleted.
    os.makedirs("/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache", exist_ok=True)
import pandas as pd

# Append the rows collected in df_dict to the rows already stored in
# Vaani-Audio-Image-Hindi2.csv and write the combined table, renumbered from
# zero and without the index column, to Vaani-Audio-Image-Hindi4.csv in the
# current working directory.
stored_rows = pd.read_csv("/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi2.csv")
collected_rows = pd.DataFrame(df_dict)
combined_rows = pd.concat([stored_rows, collected_rows], axis=0, ignore_index=True)
combined_rows.to_csv("Vaani-Audio-Image-Hindi4.csv", index=False)
# Alternative kept for reference: write only the rows collected in this run.
# pd.DataFrame(df_dict).to_csv("Vaani-Audio-Image-Hindi2.csv", index=False)