# !pip install huggingface_hub
# !pip install datasets
"""Export the English utterances of the ARTPARK-IISc/Vaani dataset to WAV.

For every dataset config whose district suffix is listed in
``ENGLISH_DISTRICTS`` the script downloads the ``train`` split, keeps the rows
whose ``language`` is ``"English"``, writes each clip to
``AUDIO_ROOT/<config>/<audio path>`` and finally stores the per-clip metadata
in ``OUTPUT_CSV``.

The Hugging Face access token is read from the ``HF_TOKEN`` environment
variable.
"""
import os
import shutil
import subprocess
from collections import Counter

import numpy as np
import pandas as pd
from datasets import get_dataset_config_names
from datasets import load_dataset
from scipy.io.wavfile import write  # type: ignore
from tqdm import tqdm, trange

DATASET_NAME = "ARTPARK-IISc/Vaani"

# District suffixes (the part after the last "_" of a config name) to export.
ENGLISH_DISTRICTS = frozenset({
    'Bellary',
    'Bijapur',
    'Churu',
    'Etah',
    'Guntur',
    'JyotibaPhuleNagar',
    'Karimnagar',
    'Krishna',
    'NorthSouthGoa',
    'Rajnandgaon',
    'Sahebganj',
})

CACHE_ROOT = '/scratch/23m1521/Vaani/Audio-Cache/English'
AUDIO_ROOT = '/scratch/23m1521/Vaani/Audio/English'
HF_HUB_CACHE = "/home/23m1521/.cache/huggingface/hub"
OUTPUT_CSV = "Vaani-Audio-Image-English.csv"

# Row fields copied verbatim into the CSV, in column order (after audio_path).
METADATA_COLUMNS = ('referenceImage', 'gender', 'state', 'district')

# Token for the dataset download. When the variable is unset this is None and
# `load_dataset` receives token=None -- NOTE(review): that presumably falls
# back to locally stored Hugging Face credentials; confirm for your setup.
HF_TOKEN = os.environ.get("HF_TOKEN")


def array_to_wav(audio_array: np.ndarray, sample_rate: int, file_name: str) -> None:
    """Write ``audio_array`` to ``file_name`` as a WAV file.

    The samples are handed to ``scipy.io.wavfile.write`` unchanged: no
    normalisation and no conversion to 16-bit PCM is applied.

    Args:
        audio_array: Audio samples to store.
        sample_rate: Sampling rate in Hz.
        file_name: Destination path of the WAV file.
    """
    write(file_name, sample_rate, audio_array)


def _remove_tree(path: str) -> None:
    """Recursively delete ``path``; a path that does not exist is not an error."""
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        pass


def main() -> None:
    """Download, filter and export every selected district, then write the CSV."""
    configs = get_dataset_config_names(DATASET_NAME)
    eng_dis_ds = [
        config for config in configs
        if config.split("_")[-1] in ENGLISH_DISTRICTS
    ]
    print(len(eng_dis_ds), eng_dis_ds, "\n\n")

    df_dict = {"audio_path": []}
    for column in METADATA_COLUMNS:
        df_dict[column] = []

    for position, district in enumerate(eng_dis_ds, start=1):
        print("-"*2, position, "-"*10, district, "-"*10)

        cache_dir = f'{CACHE_ROOT}/{district}'
        os.makedirs(cache_dir, exist_ok=True)

        ds = load_dataset(
            DATASET_NAME,
            district,
            split="train",
            num_proc=10,
            token=HF_TOKEN,
            cache_dir=cache_dir,
            streaming=False,
        )

        # Only the `language` column is passed to the predicate, so the
        # filter does not need the other columns of each row.
        english_rows = ds.filter(
            lambda language: language == "English",
            input_columns=["language"],
        )

        save_dir = f'{AUDIO_ROOT}/{district}'
        os.makedirs(save_dir, exist_ok=True)

        for row in tqdm(english_rows, dynamic_ncols=True, colour='blue'):
            audio = row['audio']
            array_to_wav(
                audio['array'],
                audio['sampling_rate'],
                os.path.join(save_dir, audio['path']),
            )
            df_dict['audio_path'].append(audio['path'])
            for column in METADATA_COLUMNS:
                df_dict[column].append(row[column])

        # Free disk space before the next district is downloaded. Note that
        # the second call wipes the whole Hugging Face hub cache directory.
        _remove_tree(cache_dir)
        _remove_tree(HF_HUB_CACHE)

    pd.DataFrame(df_dict).to_csv(OUTPUT_CSV, index=False)


if __name__ == "__main__":
    main()