|
|
|
|
|
|
|
|
|
""" |
|
1. **Andhra Pradesh [6]** |
|
Anantpur, Chittoor, Guntur, Krishna, Srikakulam, Vishakapattanam |
|
|
|
2. **Bihar [20]** |
|
Araria, Begusarai, Bhagalpur, Darbhanga, EastChamparan, Gaya, Gopalganj, Jahanabad, Jamui, Kishanganj, Lakhisarai, |
|
Madhepura, Muzaffarpur, Purnia, Saharsa, Samastipur, Saran, Sitamarhi, Supaul, Vaishali |
|
|
|
3. **Chhattisgarh [10]** |
|
Balrampur, Bastar, Bilaspur, Jashpur, Kabirdham, Korba, Raigarh, Rajnandgaon, Sarguja, Sukma |
|
|
|
4. **Goa [1]** |
|
NorthSouthGoa |
|
|
|
5. **Jharkhand [2]** |
|
Jamtara, Sahebganj |
|
|
|
6. **Karnataka [10]** |
|
Belgaum, Bellary, Bijapur, Chamarajanagar, DakshinaKannada, Dharwad, Gulbarga, Mysore, Raichur, Shimoga |
|
|
|
7. **Maharashtra [7]** |
|
Aurangabad, Chandrapur, Dhule, Nagpur, Pune, Sindhudurg, Solapur |
|
|
|
8. **Rajasthan [2]** |
|
Churu, Nagaur |
|
|
|
9. **Telangana [2]** |
|
Karimnagar, Nalgonda |
|
|
|
10. **Uttar Pradesh [10]** |
|
Budaun, Deoria, Etah, Ghazipur, Gorakhpur, Hamirpur, Jalaun, JyotibaPhuleNagar, Muzaffarnagar, Varanasi |
|
|
|
11. **Uttarakhand [2]** |
|
TehriGarhwal, Uttarkashi |
|
|
|
12. **West Bengal [8]** |
|
DakshinDinajpur, Jalpaiguri, Jhargram, Kolkata, Malda, North24Parganas, PaschimMedinipur, Purulia |
|
""" |
|
|
|
|
|
""" |
|
1. AndhraPradesh (6) |
|
Anantpur Chittoor Guntur Krishna Srikakulam Vishakapattanam |
|
|
|
2. ArunachalPradesh (2) |
|
Longding PapumPare |
|
|
|
3. Assam (2) |
|
KamrupMetropolitan Sonitpur |
|
|
|
4. Bihar (24) |
|
Araria Begusarai Bhagalpur Darbhanga EastChamparan Gaya Gopalganj Jahanabad Jamui Kaimur Katihar Kishanganj |
|
Lakhisarai Madhepura Muzaffarpur Patna Purnia Saharsa Samastipur Saran Sitamarhi Supaul Vaishali WestChamparan |
|
|
|
5. Chandigarh (1) |
|
Chandigarh |
|
|
|
6. Chhattisgarh (10) |
|
Balrampur Bastar Bilaspur Jashpur Kabirdham Korba Raigarh Rajnandgaon Sarguja Sukma |
|
|
|
7. Delhi (1) |
|
NewDelhi |
|
|
|
8. Goa (1) |
|
NorthSouthGoa |
|
|
|
9. Jharkhand (6) |
|
Deoghar Garhwa Jamtara Palamu Ranchi Sahebganj |
|
|
|
10. Karnataka (13) |
|
Bangalore Belgaum Bellary Bidar Bijapur Chamrajnagar DakshinKannada Dharwad Gulbarga Koppal Mysore Raichur Shimoga |
|
|
|
11. MadhyaPradesh (3) |
|
Bhopal Dhar Katni |
|
|
|
12. Maharashtra (7) |
|
Aurangabad Chandrapur Dhule Nagpur Pune Sindhudurga Solapur |
|
|
|
13. Meghalaya (1) |
|
WestGaroHills |
|
|
|
14. Nagaland (2) |
|
Dimapur Kohima |
|
|
|
15. Odisha (1) |
|
Khordha |
|
|
|
16. Rajasthan (3) |
|
Churu Jaipur Nagaur |
|
|
|
17. TamilNadu (4) |
|
Chennai Kanyakumari Namakkal Nilgiris |
|
|
|
18. Telangana (4) |
|
Hyderabad Karimnagar Mahabubabad Nalgonda |
|
|
|
19. Tripura (3) |
|
Dhalai Unakoti WestTripura |
|
|
|
20. UttarPradesh (13) |
|
Budaun Deoria Etah Ghazipur Gorakhpur Hamirpur Jalaun JyotibaPhuleNagar Lalitpur Lucknow |
|
Muzzaffarnagar Saharanpur Varanasi |
|
|
|
21. Uttarakhand (2) |
|
TehriGarhwal Uttarkashi |
|
|
|
22. WestBengal (11) |
|
Alipurduar CoochBehar DakshinDinajpur Darjeeling Jalpaiguri Jhargram Kolkata Malda |
|
North24Parganas PaschimMedinipur Purulia |
|
""" |
|
|
|
|
|
import os |
|
import subprocess |
|
import numpy as np |
|
from scipy.io.wavfile import write |
|
from tqdm import tqdm, trange |
|
from collections import Counter |
|
import datasets |
|
from datasets import Audio |
|
from datasets import load_dataset |
|
from datasets import get_dataset_config_names |
|
|
|
|
|
def array_to_wav(audio_array: np.ndarray, sample_rate: int, file_name: str):
    """Save an audio sample array to ``file_name`` as a WAV file.

    Thin wrapper around ``scipy.io.wavfile.write``. Compared with calling
    it directly, this also creates the destination directory when it does
    not exist yet, and accepts any array-like because the input is passed
    through ``np.asarray`` first.

    Args:
        audio_array: Audio samples. The dtype is handed to SciPy unchanged;
            SciPy picks the WAV sample format from it.
        sample_rate: Sampling rate in samples per second.
        file_name: Destination path of the WAV file.
    """
    # SciPy reads ``.dtype`` from its input, so plain sequences must be
    # converted; an ndarray passes through without a copy.
    samples = np.asarray(audio_array)

    # Only touch the filesystem for real paths; anything else is forwarded
    # to SciPy untouched.
    if isinstance(file_name, (str, os.PathLike)):
        parent_dir = os.path.dirname(file_name)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    write(file_name, sample_rate, samples)
|
|
|
|
|
# Names of every configuration the Hub lists for the Vaani dataset
# repository; printed once so the run log records what was available.
configs = get_dataset_config_names(path="ARTPARK-IISc/Vaani")
print(f"{configs} \n\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Districts selected for this run, grouped by state. Each entry becomes a
# dataset config name of the form "<State>_<District>". The spellings are
# identifiers and must be kept exactly as written (e.g. "Muzzaffarnagar").
_HINDI_DISTRICTS_BY_STATE = {
    'Bihar': (
        'Araria', 'Begusarai', 'Bhagalpur', 'Darbhanga', 'EastChamparan',
        'Gaya', 'Gopalganj', 'Jahanabad', 'Jamui', 'Kaimur', 'Katihar',
        'Kishanganj', 'Lakhisarai', 'Madhepura', 'Muzaffarpur', 'Patna',
        'Purnia', 'Saharsa', 'Samastipur', 'Saran', 'Sitamarhi',
        'Supaul', 'Vaishali', 'WestChamparan',
    ),
    'Chandigarh': (
        'Chandigarh',
    ),
    'Chhattisgarh': (
        'Balrampur', 'Bastar', 'Jashpur',
        'Kabirdham', 'Korba', 'Rajnandgaon',
        'Sarguja', 'Sukma',
    ),
    'Jharkhand': (
        'Deoghar', 'Garhwa', 'Jamtara', 'Palamu', 'Sahebganj',
    ),
    'Maharashtra': (
        'Chandrapur', 'Dhule', 'Sindhudurga', 'Solapur',
    ),
    'UttarPradesh': (
        'Budaun', 'Hamirpur', 'JyotibaPhuleNagar',
        'Muzzaffarnagar', 'Saharanpur',
    ),
    'Uttarakhand': (
        'TehriGarhwal', 'Uttarkashi',
    ),
    'WestBengal': (
        'Alipurduar', 'CoochBehar', 'DakshinDinajpur',
        'Darjeeling', 'Jalpaiguri', 'Jhargram', 'Kolkata',
        'Malda', 'North24Parganas', 'PaschimMedinipur',
        'Purulia',
    ),
}

# Flatten to the "<State>_<District>" names, preserving the order above.
Hindi_district = [
    f"{state_name}_{district_name}"
    for state_name, district_names in _HINDI_DISTRICTS_BY_STATE.items()
    for district_name in district_names
]
|
|
|
|
|
# Set for O(1) membership tests against the configs the Hub reported.
available_configs = set(configs)

# Requested configs that the Hub does not list. Reported up front so they
# are skipped here instead of failing later inside the download loop.
missing = [name for name in Hindi_district if name not in available_configs]
if missing:
    print(f"Skipping {len(missing)} config(s) not listed on the Hub: {missing}")

# Configs to process: the requested ones that exist, in Hindi_district order.
# (The original computed a filtered list and then overwrote it with the
# unfiltered Hindi_district, so the availability check never took effect.)
hindi_dis_ds = [name for name in Hindi_district if name in available_configs]
print(len(hindi_dis_ds), hindi_dis_ds, "\n\n")
|
|
|
|
|
import os

# Hugging Face access token, read from the local token file. A context
# manager is used so the file handle is closed right after reading.
with open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r") as token_file:
    HF_TOKEN = token_file.read().strip()

# Column-wise accumulator for the metadata of every exported clip; it is
# turned into a DataFrame at the end of the script.
df_dict = {"audio_path": [], 'referenceImage': [], 'gender': [], 'state': [], 'district': []}

os.makedirs("/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache", exist_ok=True)
# NOTE(review): `datasets` is already imported at this point. If the library
# reads HF_DATASETS_CACHE only at import time, this assignment has no effect
# on the current process - confirm, or move it above the `datasets` import.
os.environ["HF_DATASETS_CACHE"] = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache"

# Text file with one reference-image identifier per line. It is loaded here
# when present, and the download loop appends newly seen identifiers to it.
ref_image_file = "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/used_reference_images_Hindi-copy_and.txt"
referenceImages = set()
if os.path.exists(ref_image_file):
    with open(ref_image_file, "r") as f:
        # Blank lines are ignored; surrounding whitespace is stripped.
        referenceImages = {line.strip() for line in f if line.strip()}
|
|
|
# Per-district export: download one config, keep the Hindi rows, and for each
# row whose reference image has not been seen before, record its metadata and
# write its audio as a WAV file. Caches are deleted after every district.
#
# NOTE(review): the indentation of this loop was lost in the reviewed copy.
# The nesting below (everything from `referenceImages.add` through
# `pbar.refresh()` inside the `if`, `pbar.update(1)` once per row, cleanup
# once per district) is reconstructed - confirm against the real file.
for i, district in enumerate(hindi_dis_ds):

    print("-"*2, i+1, "-"*10, district, "-"*10)

    cache_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/{district}'
    os.makedirs(cache_dir, exist_ok=True)

    ds = load_dataset("ARTPARK-IISc/Vaani", district, split="train", num_proc=20,
                      token=HF_TOKEN, cache_dir=cache_dir, streaming=False)

    # Turn audio decoding off before filtering - presumably so the language
    # filter does not have to decode every clip (verify).
    ds = ds.cast_column("audio", datasets.Audio(decode=False))

    Hindi_rows = ds.filter(lambda x: x["language"] == "Hindi")

    # Turn decoding back on so each kept row exposes 'array' and
    # 'sampling_rate', which are read below.
    Hindi_rows = Hindi_rows.cast_column("audio", datasets.Audio())

    save_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/{district}'
    os.makedirs(save_dir, exist_ok=True)

    # Both the append handle and the progress bar are context-managed so they
    # are released even if a row fails part-way through.
    with open(ref_image_file, "a") as ref_out, \
            tqdm(total=len(Hindi_rows), dynamic_ncols=True, colour='blue') as pbar:
        # `row` rather than `i`: the original reused `i` here and shadowed
        # the district index of the outer loop.
        for row in Hindi_rows:
            ref_img = row['referenceImage']
            if ref_img not in referenceImages:
                # Remember the image in memory and on disk.
                referenceImages.add(ref_img)
                ref_out.write(ref_img + "\n")

                audio = row['audio']
                df_dict['audio_path'].append(audio['path'])
                df_dict['referenceImage'].append(row['referenceImage'])
                df_dict['gender'].append(row['gender'])
                df_dict['state'].append(row['state'])
                df_dict['district'].append(row['district'])

                array_to_wav(
                    audio['array'],
                    audio['sampling_rate'],
                    os.path.join(save_dir, audio['path'])
                )
                pbar.set_description(f"{district} referenceImages {len(referenceImages)}")
                pbar.refresh()

            pbar.update(1)

    # Remove this district's download cache and the shared Hugging Face
    # caches, in the same order as the original five `rm -rf` calls.
    for stale_path in (
        cache_dir,
        "/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/hub/datasets--ARTPARK-IISc--Vaani",
        "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache",
        "/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/datasets",
        "/scratch/IITB/ai-at-ieor/23m1521/hf_cache/hub/datasets--ARTPARK-IISc--Vaani",
    ):
        subprocess.run(["rm", "-rf", stale_path], check=True)

    # Recreate the shared cache directory that was just deleted.
    os.makedirs("/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/cache", exist_ok=True)
|
|
|
|
|
|
|
import pandas as pd

# Append the rows collected in df_dict to the existing "Hindi2" CSV and save
# the result as "Vaani-Audio-Image-Hindi4.csv" in the current working
# directory. The index is renumbered and not written to the file.
existing_rows = pd.read_csv("/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi2.csv")
collected_rows = pd.DataFrame(df_dict)
combined_rows = pd.concat([existing_rows, collected_rows], axis=0, ignore_index=True)
combined_rows.to_csv("Vaani-Audio-Image-Hindi4.csv", index=False)
|
|
|
|