# --- Version 1 (superseded): parallel polars scan over the hf:// parquet shards ---
# import os
# from huggingface_hub import login
# import polars as pl
# from tqdm import tqdm
# from datasets import get_dataset_config_names
# import pandas as pd
# from joblib import Parallel, delayed
#
# # Authenticate HuggingFace
# HF_TOKEN = open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r").read().strip()
# os.environ["HF_TOKEN"] = HF_TOKEN
# login(token=HF_TOKEN)
#
# # Get dataset configs
# configs = get_dataset_config_names("ARTPARK-IISc/Vaani")
# print(configs, "\n\n")
#
# # Prepare URLs and metadata
# urls_dict = {'state': [], 'district': [], 'url': []}
# for i in configs:
#     state = i.split('_')[0]
#     district = i.split('_')[-1]
#     urls_dict['state'].append(state)
#     urls_dict['district'].append(district)
#     urls_dict['url'].append(f"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet")
# urls_df = pd.DataFrame(urls_dict)
# urls_df = urls_df.iloc[11:, :]  # skip the first 11 configs
#
# # Output directory
# savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData"
# os.makedirs(savedir, exist_ok=True)
#
# # Function to process each config
# def make_filter_df(row):
#     state = row['state']
#     district = row['district']
#     url = row['url']
#     try:
#         df = pl.scan_parquet(url).with_columns(
#             pl.col('audio').struct.field('path').alias('audio_path')
#         ).drop('audio').collect()
#         df.write_parquet(f"{savedir}/{state}_{district}_meta.parquet", compression="gzip")
#         print(f"✅ Saved {state}_{district}_meta.parquet")
#     except Exception as e:
#         print(f"❌ Error processing {state}_{district}: {e}")
#
# # Parallel processing using joblib
# Parallel(n_jobs=20, backend="loky")(
#     delayed(make_filter_df)(row)
#     for _, row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70)
# )
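# A minimal sketch (on synthetic data) of the metadata-extraction idiom used
# above: the 'audio' column is a struct, and struct.field('path') pulls out the
# file path so the heavy audio bytes can be dropped. All names here are
# illustrative, not part of the pipeline:
#
# import polars as pl
# demo = pl.DataFrame({
#     "audio": [{"path": "a.wav", "bytes": b"\x00"}, {"path": "b.wav", "bytes": b"\x01"}],
#     "transcript": ["namaste", "vanakkam"],
# })
# meta = demo.with_columns(
#     pl.col("audio").struct.field("path").alias("audio_path")
# ).drop("audio")
# assert meta.columns == ["transcript", "audio_path"]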
##############################################################################################################

# --- Version 2 (superseded): sequential eager reads with a resume check ---
# import os
# from huggingface_hub import login
# import polars as pl
# from tqdm import tqdm, trange
# from datasets import get_dataset_config_names
# import pandas as pd
#
# # HF_TOKEN = open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r").read().strip()
# # os.environ["HF_TOKEN"] = HF_TOKEN
# # login(token=HF_TOKEN)
#
# # configs = get_dataset_config_names("ARTPARK-IISc/Vaani")
# configs = [
#     'AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna',
#     'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', 'ArunachalPradesh_Longding',
#     'ArunachalPradesh_PapumPare', 'Assam_KamrupMetropolitan', 'Assam_Sonitpur', 'Bihar_Araria',
#     'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan', 'Bihar_Gaya',
#     'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar',
#     'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna',
#     'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi',
#     'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan', 'Chandigarh_Chandigarh',
#     'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur',
#     'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon',
#     'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma', 'Delhi_NewDelhi', 'Goa_NorthSouthGoa',
#     'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu',
#     'Jharkhand_Ranchi', 'Jharkhand_Sahebganj', 'Karnataka_Bangalore', 'Karnataka_Belgaum',
#     'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur', 'Karnataka_Chamrajnagar',
#     'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga', 'Karnataka_Koppal',
#     'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga', 'MadhyaPradesh_Bhopal',
#     'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni', 'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur',
#     'Maharashtra_Dhule', 'Maharashtra_Nagpur', 'Maharashtra_Pune', 'Maharashtra_Sindhudurga',
#     'Maharashtra_Solapur', 'Meghalaya_WestGaroHills', 'Nagaland_Dimapur', 'Nagaland_Kohima',
#     'Odisha_Khordha', 'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'TamilNadu_Chennai',
#     'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris', 'Telangana_Hyderabad',
#     'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda', 'Tripura_Dhalai',
#     'Tripura_Unakoti', 'Tripura_WestTripura', 'UttarPradesh_Budaun', 'UttarPradesh_Deoria',
#     'UttarPradesh_Etah', 'UttarPradesh_Ghazipur', 'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur',
#     'UttarPradesh_Jalaun', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur',
#     'UttarPradesh_Lucknow', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur',
#     'UttarPradesh_Varanasi', 'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi',
#     'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur',
#     'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata',
#     'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur',
#     'WestBengal_Purulia', 'images'
# ]
# print(configs, "\n\n")
#
# urls_dict = {'state': [], 'district': [], 'url': [], 'config_name': []}
# for i in configs:
#     state = i.split('_')[0]
#     district = i.split('_')[-1]
#     urls_dict['state'].append(state)
#     urls_dict['district'].append(district)
#     urls_dict['url'].append(f"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet")
#     urls_dict['config_name'].append(i)
# urls_df = pd.DataFrame(urls_dict)
#
# # def make_filter_df(state, district, url, savedir):
# #     df = pl.scan_parquet(url).with_columns(
# #         pl.col('audio').struct.field('path').alias('audio_path')
# #     ).drop('audio').collect().write_parquet(
# #         f"{savedir}/{state}_{district}_meta.parquet",
# #         compression="gzip"
# #     )
# #     print(f"✅ Saved {state}_{district}_meta.parquet")
#
# def make_filter_df_eager(state, district, url, savedir):
#     df = pl.read_parquet(url).with_columns(
#         pl.col('audio').struct.field('path').alias('audio_path')
#     ).drop('audio')
#     output_path = f"{savedir}/{state}_{district}_meta.parquet"
#     df.write_parquet(output_path, compression="gzip")
#     print(f"✅ Saved {state}_{district}_meta.parquet")
#
# savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData"
# os.makedirs(savedir, exist_ok=True)
#
# # Configs whose metadata parquet already exists
# done = ["_".join(i.split(".")[0].split("_")[:-1]) for i in sorted(os.listdir(savedir))]
#
# for row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70):
#     state = row[1]['state']
#     district = row[1]['district']
#     url = row[1]['url']
#     config_name = row[1]['config_name']
#     if config_name in done:
#         print("⚠️ Already done ", state, district, url)
#     else:
#         print("🏃 Running ", state, district, url)
#         make_filter_df_eager(state, district, url, savedir)
#         done.append(config_name)
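# How the resume check above recovers a config name from a saved file: every
# output is named "{state}_{district}_meta.parquet", so stripping the extension
# and the trailing "_meta" gives back the config name. A tiny sanity check:
#
# fname = "AndhraPradesh_Anantpur_meta.parquet"
# assert "_".join(fname.split(".")[0].split("_")[:-1]) == "AndhraPradesh_Anantpur"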
##############################################################################################################

# --- Version 3 (superseded): download each config with `datasets`, then strip the audio ---
# import os
# from huggingface_hub import login
# import polars as pl
# from tqdm import tqdm, trange
# from datasets import get_dataset_config_names, load_dataset, Audio
# import pandas as pd
# import subprocess
#
# HF_TOKEN = open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r").read().strip()
#
# configs = [...]  # same hard-coded list as in Version 2 above (all districts plus 'images')
# print(configs, "\n\n")
#
# urls_dict = {'state': [], 'district': [], 'url': [], 'config_name': []}
# for i in configs:
#     state = i.split('_')[0]
#     district = i.split('_')[-1]
#     urls_dict['state'].append(state)
#     urls_dict['district'].append(district)
#     urls_dict['url'].append(f"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet")
#     urls_dict['config_name'].append(i)
# urls_df = pd.DataFrame(urls_dict)
#
# def make_filter_df_from_hf(state, district, config_name, cache_dir, savedir, hf_token=HF_TOKEN):
#     ds = load_dataset(
#         "ARTPARK-IISc/Vaani",
#         config_name,  # the full "<State>_<District>" config, not just the district
#         split="train",
#         num_proc=20,
#         token=hf_token,
#         cache_dir=cache_dir,
#         streaming=False
#     )
#     ds = ds.cast_column("audio", Audio(decode=False))
#     audio_paths = []
#     columns = {col: [] for col in ds.column_names if col != "audio"}
#     for row in tqdm(ds, desc=f"Processing {state}_{district}"):
#         audio_paths.append(row["audio"]["path"])
#         for col in columns:
#             columns[col].append(row[col])
#     df = pl.DataFrame({
#         "audio_path": audio_paths,
#         **columns
#     })
#     os.makedirs(savedir, exist_ok=True)
#     out_path = os.path.join(savedir, f"{state}_{district}_meta.parquet")
#     df.write_parquet(out_path, compression="gzip")
#     print(f"✅ Saved {state}_{district}_meta.parquet to {out_path}")
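# The per-row loop above works but is pure Python; a vectorized alternative,
# as a sketch only (untested assumption: with Audio(decode=False) the audio
# column converts to {"bytes", "path"} dicts under Dataset.to_pandas):
#
# pdf = ds.to_pandas()
# pdf["audio_path"] = pdf["audio"].map(lambda a: a["path"])
# meta = pl.from_pandas(pdf.drop(columns=["audio"]))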
# savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData"
# os.makedirs(savedir, exist_ok=True)
#
# done = ["_".join(i.split(".")[0].split("_")[:-1]) for i in sorted(os.listdir(savedir))]
#
# for row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70):
#     state = row[1]['state']
#     district = row[1]['district']
#     url = row[1]['url']
#     config_name = row[1]['config_name']
#     cache_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/{district}'
#     if config_name in done:
#         print("⚠️ Already done ", state, district, url)
#     else:
#         print("🏃 Running ", state, district, url)
#         make_filter_df_from_hf(state, district, config_name, cache_dir, savedir)
#         subprocess.run(["rm", "-rf", cache_dir], check=True)
#         done.append(config_name)

##############################################################################################################

# --- Live pipeline: per-config HF download, metadata extraction, cache cleanup ---
from joblib import Parallel, delayed
import os
from huggingface_hub import login
import polars as pl
from tqdm import tqdm
from datasets import load_dataset, Audio
import pandas as pd
from time import sleep
import multiprocessing
import subprocess

# Increase the timeout for HF downloads
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "120"  # seconds

multiprocessing.set_start_method("fork", force=True)

# Token setup
with open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r") as f:
    HF_TOKEN = f.read().strip()

# Cache and save directories
savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData"
os.makedirs(savedir, exist_ok=True)

# Dataset configs (add all desired configs here)
configs = [
    # Andhra Pradesh
    'AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur',
    'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam',
    # Arunachal Pradesh
    'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare',
    # Assam
    'Assam_KamrupMetropolitan', 'Assam_Sonitpur',
    # Bihar
    'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga',
    'Bihar_EastChamparan', 'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad',
    'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar', 'Bihar_Kishanganj',
    'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna',
    'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran',
    'Bihar_Sitamarhi', 'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan',
    # Chandigarh
    'Chandigarh_Chandigarh',
    # Chhattisgarh
    'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur',
    'Chhattisgarh_Jashpur', 'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba',
    'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon', 'Chhattisgarh_Sarguja',
    'Chhattisgarh_Sukma',
    # Delhi
    'Delhi_NewDelhi',
    # Goa
    'Goa_NorthSouthGoa',
    # Jharkhand
    'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara',
    'Jharkhand_Palamu', 'Jharkhand_Ranchi', 'Jharkhand_Sahebganj',
    # Karnataka
    'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary',
    'Karnataka_Bidar', 'Karnataka_Bijapur', 'Karnataka_Chamrajnagar',
    'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga',
    'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga',
    # Madhya Pradesh
    'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni',
    # Maharashtra
    'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule',
    'Maharashtra_Nagpur', 'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur',
    # Meghalaya
    'Meghalaya_WestGaroHills',
    # Nagaland
    'Nagaland_Dimapur', 'Nagaland_Kohima',
    # Odisha
    'Odisha_Khordha',
    # Rajasthan
    'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur',
    # Tamil Nadu
    'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris',
    # Telangana
    'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda',
    # Tripura
    'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura',
    # Uttar Pradesh
    'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah',
    'UttarPradesh_Ghazipur', 'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur',
    'UttarPradesh_Jalaun', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur',
    'UttarPradesh_Lucknow', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur',
    'UttarPradesh_Varanasi',
    # Uttarakhand
    'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi',
    # West Bengal
    'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur',
    'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram',
    'WestBengal_Kolkata', 'WestBengal_Malda', 'WestBengal_North24Parganas',
    'WestBengal_PaschimMedinipur', 'WestBengal_Purulia'
]

# Build a DataFrame of configs
urls_dict = {'state': [], 'district': [], 'config_name': []}
for config in configs:
    state, district = config.split('_')
    urls_dict['state'].append(state)
    urls_dict['district'].append(district)
    urls_dict['config_name'].append(config)
urls_df = pd.DataFrame(urls_dict)

# Drop configs whose metadata parquet has already been written
done = ["_".join(fname.split(".")[0].split("_")[:-1])
        for fname in os.listdir(savedir) if fname.endswith(".parquet")]
urls_df = urls_df[~urls_df['config_name'].isin(done)]
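# Every config name above is "<State>_<District>" with exactly one underscore
# (unlike the earlier versions, the 'images' config is omitted here), so the
# two-way unpack in the loop above is safe. Make that assumption explicit:
assert all(config.count('_') == 1 for config in configs)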
# Process one config: download via `datasets`, strip the audio bytes, save metadata
def make_filter_df_from_hf(state, district, config_district, savedir, hf_token=HF_TOKEN, retries=3):
    cache_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/{district}'
    os.makedirs(cache_dir, exist_ok=True)
    print('\n\n', "-" * 10, config_district, "-" * 10)

    attempt = 0
    while True:  # retry indefinitely; `retries` is only used by the bounded variant sketched below
        try:
            ds = load_dataset(
                "ARTPARK-IISc/Vaani",
                config_district,
                split="train",
                num_proc=48,
                token=hf_token,
                cache_dir=cache_dir,
                streaming=False
            )
            # Keep audio as raw {"bytes", "path"} dicts; no waveform decoding
            ds = ds.cast_column("audio", Audio(decode=False))

            audio_paths = []
            columns = {col: [] for col in ds.column_names if col != "audio"}
            for row in tqdm(ds, desc=f"Processing {state}_{district}"):
                audio_paths.append(row["audio"]["path"])
                for col in columns:
                    columns[col].append(row[col])

            df = pl.DataFrame({"audio_path": audio_paths, **columns})
            os.makedirs(savedir, exist_ok=True)
            out_path = os.path.join(savedir, f"{state}_{district}_meta.parquet")
            df.write_parquet(out_path, compression="gzip")

            # Reclaim scratch space: per-district cache plus the shared HF blob store
            subprocess.run(
                ["rm", "-rf", cache_dir,
                 '/scratch/IITB/ai-at-ieor/23m1521/hf_cache/hub/datasets--ARTPARK-IISc--Vaani/blobs/'],
                check=True
            )
            print(f"✅ Saved {state}_{district}_meta.parquet to {out_path}")
            break
        except Exception as e:
            attempt += 1
            print(f"❌ Failed {config_district} on attempt {attempt} with error: {e}")
            sleep(5)
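# A bounded-retry variant of the loop above (sketch only; the live pipeline
# deliberately retries forever with a fixed 5 s pause). It would reuse the
# `retries` parameter that make_filter_df_from_hf already accepts:
#
# for attempt in range(retries):
#     try:
#         ...  # same download/extract/save body as above
#         break
#     except Exception as e:
#         print(f"❌ Attempt {attempt + 1}/{retries} failed: {e}")
#         sleep(5 * 2 ** attempt)  # exponential backoff
# else:
#     print(f"🚨 Giving up on {state}_{district} after {retries} attempts")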
# Parallel execution (n_jobs=1 runs the remaining configs sequentially)
Parallel(n_jobs=1, backend="loky")(
    delayed(make_filter_df_from_hf)(
        row['state'], row['district'], row['config_name'], savedir
    )
    for _, row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='green', ncols=70)
)
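# Note on scaling: raising n_jobs would process several districts at once, and
# each call already gets its own per-district cache_dir; however, every worker
# also deletes the shared datasets--ARTPARK-IISc--Vaani/blobs/ directory, so
# concurrent workers could race on that cleanup. n_jobs=1 sidesteps this.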