# !pip install huggingface_hub # !pip install datasets """ 1. **Andhra Pradesh [6]** Anantpur, Chittoor, Guntur, Krishna, Srikakulam, Vishakapattanam 2. **Bihar [20]** Araria, Begusarai, Bhagalpur, Darbhanga, EastChamparan, Gaya, Gopalganj, Jahanabad, Jamui, Kishanganj, Lakhisarai, Madhepura, Muzaffarpur, Purnia, Saharsa, Samastipur, Saran, Sitamarhi, Supaul, Vaishali 3. **Chhattisgarh [10]** Balrampur, Bastar, Bilaspur, Jashpur, Kabirdham, Korba, Raigarh, Rajnandgaon, Sarguja, Sukma 4. **Goa [1]** NorthSouthGoa 5. **Jharkhand [2]** Jamtara, Sahebganj 6. **Karnataka [10]** Belgaum, Bellary, Bijapur, Chamarajanagar, DakshinaKannada, Dharwad, Gulbarga, Mysore, Raichur, Shimoga 7. **Maharashtra [7]** Aurangabad, Chandrapur, Dhule, Nagpur, Pune, Sindhudurg, Solapur 8. **Rajasthan [2]** Churu, Nagaur 9. **Telangana [2]** Karimnagar, Nalgonda 10. **Uttar Pradesh [10]** Budaun, Deoria, Etah, Ghazipur, Gorakhpur, Hamirpur, Jalaun, JyotibaPhuleNagar, Muzaffarnagar, Varanasi 11. **Uttarakhand [2]** TehriGarhwal, Uttarkashi 12. **West Bengal [8]** DakshinDinajpur, Jalpaiguri, Jhargram, Kolkata, Malda, North24Parganas, PaschimMedinipur, Purulia """ # Districts in Vaani dataset (NEW) """ 1. AndhraPradesh (6) Anantpur Chittoor Guntur Krishna Srikakulam Vishakapattanam 2. ArunachalPradesh (2) Longding PapumPare 3. Assam (2) KamrupMetropolitan Sonitpur 4. Bihar (24) Araria Begusarai Bhagalpur Darbhanga EastChamparan Gaya Gopalganj Jahanabad Jamui Kaimur Katihar Kishanganj Lakhisarai Madhepura Muzaffarpur Patna Purnia Saharsa Samastipur Saran Sitamarhi Supaul Vaishali WestChamparan 5. Chandigarh (1) Chandigarh 6. Chhattisgarh (10) Balrampur Bastar Bilaspur Jashpur Kabirdham Korba Raigarh Rajnandgaon Sarguja Sukma 7. Delhi (1) NewDelhi 8. Goa (1) NorthSouthGoa 9. Jharkhand (6) Deoghar Garhwa Jamtara Palamu Ranchi Sahebganj 10. Karnataka (13) Bangalore Belgaum Bellary Bidar Bijapur Chamrajnagar DakshinKannada Dharwad Gulbarga Koppal Mysore Raichur Shimoga 11. MadhyaPradesh (3) Bhopal Dhar Katni 12. Maharashtra (7) Aurangabad Chandrapur Dhule Nagpur Pune Sindhudurga Solapur 13. Meghalaya (1) WestGaroHills 14. Nagaland (2) Dimapur Kohima 15. Odisha (1) Khordha 16. Rajasthan (3) Churu Jaipur Nagaur 17. TamilNadu (4) Chennai Kanyakumari Namakkal Nilgiris 18. Telangana (4) Hyderabad Karimnagar Mahabubabad Nalgonda 19. Tripura (3) Dhalai Unakoti WestTripura 20. UttarPradesh (13) Budaun Deoria Etah Ghazipur Gorakhpur Hamirpur Jalaun JyotibaPhuleNagar Lalitpur Lucknow Muzzaffarnagar Saharanpur Varanasi 21. Uttarakhand (2) TehriGarhwal Uttarkashi 22. WestBengal (11) Alipurduar CoochBehar DakshinDinajpur Darjeeling Jalpaiguri Jhargram Kolkata Malda North24Parganas PaschimMedinipur Purulia """ import os import subprocess import numpy as np from scipy.io.wavfile import write # type: ignore from tqdm import tqdm, trange from collections import Counter import datasets from datasets import Audio from datasets import load_dataset from datasets import get_dataset_config_names def array_to_wav(audio_array: np.ndarray, sample_rate: int, file_name: str): # Normalize if outside [-1, 1] to prevent clipping # if np.max(np.abs(audio_array)) > 1.0: # audio_array = audio_array / np.max(np.abs(audio_array)) # Convert to 16-bit PCM # audio_int16 = np.int16(audio_array * 32767) # Write to WAV file # write(file_name, sample_rate, audio_int16) write(file_name, sample_rate, audio_array) configs = get_dataset_config_names("ARTPARK-IISc/Vaani") print(configs, "\n\n") # exit() # Hindi_district = [ # 'Delhi_NewDelhi', # 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Katni', # 'Chhattisgarh_Bilaspur', # 'Maharashtra_Nagpur', # 'UttarPradesh_Varanasi', 'UttarPradesh_Lucknow', 'UttarPradesh_Gorakhpur' # ] # Hindi_district = [ # 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'Rajasthan_Churu', # 'MadhyaPradesh_Dhar', # 'Maharashtra_Pune', 'Maharashtra_Aurangabad', # 'Chhattisgarh_Raigarh', # 'UttarPradesh_Lalitpur', 'UttarPradesh_Jalaun', 'UttarPradesh_Etah', 'UttarPradesh_Deoria', 'UttarPradesh_Ghazipur' # ] # Hindi_district = [ # 'Jharkhand_Ranchi' # ] Hindi_district = [ # 'AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', # 'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', # 'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare', # 'Assam_KamrupMetropolitan', 'Assam_Sonitpur', 'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan', 'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar', 'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna', 'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan', 'Chandigarh_Chandigarh', 'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Jashpur', 'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Rajnandgaon', 'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma', # 'Goa_NorthSouthGoa', 'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu', 'Jharkhand_Sahebganj', # 'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur', # 'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga', # 'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur', # 'Meghalaya_WestGaroHills', # 'Nagaland_Dimapur', 'Nagaland_Kohima', # 'Odisha_Khordha', # 'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris', # 'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda', # 'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura', 'UttarPradesh_Budaun', 'UttarPradesh_Hamirpur', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi', 'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur', 'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata', 'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia' ] missing = [i for i in Hindi_district if i not in configs] # assert not missing, f"Hindi districts {missing} not in configs" # hindi_dis_ds = [i for i in configs if i.split("_")[-1] in Hindi_district] hindi_dis_ds = [i for i in configs if i in Hindi_district] hindi_dis_ds = Hindi_district print(len(hindi_dis_ds), hindi_dis_ds, "\n\n") HF_TOKEN = open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r").read().strip() df_dict = {"audio_path":[], 'referenceImage':[], 'gender':[], 'state':[], 'district':[]} import os os.makedirs("/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache", exist_ok=True) os.environ["HF_DATASETS_CACHE"] = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache" # Load existing reference images if file exists ref_image_file = "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/used_reference_images_Hindi-copy_and.txt" referenceImages = set() if os.path.exists(ref_image_file): with open(ref_image_file, "r") as f: referenceImages = set([line.strip() for line in f if line.strip()]) for i, district in enumerate(hindi_dis_ds): # district = "Delhi_NewDelhi" print("-"*2, i+1, "-"*10, district, "-"*10) cache_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/{district}' os.makedirs(cache_dir, exist_ok=True) ds = load_dataset("ARTPARK-IISc/Vaani", district, split="train", num_proc=20, token=HF_TOKEN, cache_dir=cache_dir, streaming=False) # Temporarily disable decoding of audio to avoid reading the actual files ds = ds.cast_column("audio", datasets.Audio(decode=False)) # Now filter safely Hindi_rows = ds.filter(lambda x: x["language"] == "Hindi") # Re-enable decoding Hindi_rows = Hindi_rows.cast_column("audio", datasets.Audio()) save_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/{district}' os.makedirs(save_dir, exist_ok=True) pbar = tqdm(total=len(Hindi_rows), dynamic_ncols=True, colour='blue') with open(ref_image_file, "a") as ref_out: for i in Hindi_rows: ref_img = i['referenceImage'] if ref_img not in referenceImages: referenceImages.add(ref_img) ref_out.write(ref_img + "\n") df_dict['audio_path'].append(i['audio']['path']) df_dict['referenceImage'].append(i['referenceImage']) df_dict['gender'].append(i['gender']) df_dict['state'].append(i['state']) df_dict['district'].append(i['district']) array_to_wav( i['audio']['array'], i['audio']['sampling_rate'], os.path.join(save_dir, i['audio']['path']) ) pbar.set_description(f"{district} referenceImages {len(referenceImages)}") pbar.refresh() pbar.update(1) pbar.close() subprocess.run(["rm", "-rf", cache_dir], check=True) subprocess.run(["rm", "-rf", "/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/hub/datasets--ARTPARK-IISc--Vaani"], check=True) subprocess.run(["rm", "-rf", "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache"], check=True) subprocess.run(["rm", "-rf", "/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/datasets"], check=True) subprocess.run(["rm", "-rf", "/scratch/IITB/ai-at-ieor/23m1521/hf_cache/hub/datasets--ARTPARK-IISc--Vaani"], check=True) os.makedirs("/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache", exist_ok=True) # break import pandas as pd pd.concat([ pd.read_csv("/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi2.csv"), pd.DataFrame(df_dict) ], axis=0, ignore_index=True ).to_csv( "Vaani-Audio-Image-Hindi4.csv", index=False ) # pd.DataFrame(df_dict).to_csv("Vaani-Audio-Image-Hindi2.csv", index=False)