|
|
|
|
|
|
|
|
|
import os |
|
import subprocess |
|
import numpy as np |
|
from scipy.io.wavfile import write |
|
from tqdm import tqdm, trange |
|
from collections import Counter |
|
from datasets import load_dataset |
|
from datasets import get_dataset_config_names |
|
|
|
|
|
def array_to_wav(audio_array: np.ndarray, sample_rate: int, file_name: str) -> None:
    """Write audio samples to a WAV file.

    Args:
        audio_array: Samples to store. Anything ``np.asarray`` accepts is
            allowed; an ndarray is passed through unchanged. The samples are
            not rescaled or converted here -- scipy chooses the WAV sample
            format from the array's dtype.
        sample_rate: Sampling rate in samples per second.
        file_name: Destination path. Missing parent directories are created.
            Non-path targets (e.g. an open file handle) are handed to scipy
            untouched.
    """
    if isinstance(file_name, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(file_name))
        # An empty dirname means "current directory"; os.makedirs("") would raise.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # scipy's writer needs an object with a dtype, so coerce plain sequences.
    write(file_name, sample_rate, np.asarray(audio_array))
|
|
|
|
|
# Every configuration name published for the Vaani dataset on the Hub.
configs = get_dataset_config_names("ARTPARK-IISc/Vaani")

# Districts to export. NOTE(review): presumably the districts that contain
# English recordings -- confirm against the dataset card.
english_district = [
    'Bellary', 'Bijapur', 'Churu', 'Etah', 'Guntur', 'JyotibaPhuleNagar',
    'Karimnagar', 'Krishna', 'NorthSouthGoa', 'Rajnandgaon', 'Sahebganj',
]

# Keep the configurations whose text after the last underscore is one of the
# districts above (the whole name is compared when it has no underscore).
eng_dis_ds = [
    config_name
    for config_name in configs
    if config_name.rpartition("_")[2] in english_district
]
print(len(eng_dis_ds), eng_dis_ds, "\n\n")
|
|
|
|
|
|
|
# Column-oriented accumulator for the metadata CSV: one independent, initially
# empty list per column, filled row by row while the audio is exported.
df_dict = {
    column_name: []
    for column_name in ("audio_path", 'referenceImage', 'gender', 'state', 'district')
}
|
|
|
|
|
# Resolve the Hugging Face access token once, before any download starts.
# A module-level HF_TOKEN is used when one has been defined; otherwise fall
# back to the HF_TOKEN environment variable instead of failing with a
# NameError on the first district.
try:
    hf_token = HF_TOKEN
except NameError:
    hf_token = os.environ.get("HF_TOKEN")

# For each selected configuration: download it, keep the English rows, write
# their audio to WAV files, record the metadata, then delete the caches.
for district_idx, district in enumerate(eng_dis_ds, start=1):

    print("-"*2, district_idx, "-"*10, district, "-"*10)

    # Per-district download cache so it can be deleted once the export is done.
    cache_dir = f'/scratch/23m1521/Vaani/Audio-Cache/English/{district}'
    os.makedirs(cache_dir, exist_ok=True)

    ds = load_dataset("ARTPARK-IISc/Vaani", district, split="train", num_proc=10,
                      token=hf_token, cache_dir=cache_dir, streaming=False)
    # Hand only the "language" column to the predicate (input_columns) rather
    # than the whole row, which would also carry the audio field.
    english_rows = ds.filter(lambda language: language == "English",
                             input_columns="language")

    save_dir = f'/scratch/23m1521/Vaani/Audio/English/{district}'
    os.makedirs(save_dir, exist_ok=True)

    # `row` is deliberately not named `i`: that would shadow the district
    # counter of the enclosing loop.
    for row in tqdm(english_rows, dynamic_ncols=True, colour='blue'):
        audio = row['audio']
        array_to_wav(
            audio['array'],
            audio['sampling_rate'],
            os.path.join(save_dir, audio['path'])
        )
        # The CSV stores the audio path as given by the dataset, not the
        # joined output path.
        df_dict['audio_path'].append(audio['path'])
        for column in ('referenceImage', 'gender', 'state', 'district'):
            df_dict[column].append(row[column])

    # Free disk space before the next district: drop this district's cache and
    # the whole Hugging Face hub cache. check=True aborts the run if rm fails.
    subprocess.run(["rm", "-rf", cache_dir], check=True)
    subprocess.run(["rm", "-rf", "/home/23m1521/.cache/huggingface/hub"], check=True)
|
|
|
|
|
import pandas as pd

# Build a table from the collected metadata columns and save it as CSV in the
# current working directory; index=False leaves the row index out of the file.
_metadata_frame = pd.DataFrame(df_dict)
_metadata_frame.to_csv("Vaani-Audio-Image-English.csv", index=False)