# import os | |
# from huggingface_hub import login | |
# import polars as pl | |
# from tqdm import tqdm | |
# from datasets import get_dataset_config_names | |
# import pandas as pd | |
# from joblib import Parallel, delayed | |
# # Authenticate HuggingFace | |
# HF_TOKEN = open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r").read().strip() | |
# os.environ["HF_TOKEN"] = HF_TOKEN | |
# login(token=HF_TOKEN) | |
# # Get dataset configs | |
# configs = get_dataset_config_names("ARTPARK-IISc/Vaani") | |
# print(configs, "\n\n") | |
# # Prepare URLs and metadata | |
# urls_dict = {'state': [], 'district': [], 'url': []} | |
# for i in configs: | |
# state = i.split('_')[0] | |
# district = i.split('_')[-1] | |
# urls_dict['state'].append(state) | |
# urls_dict['district'].append(district) | |
# urls_dict['url'].append(f"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet") | |
# urls_df = pd.DataFrame(urls_dict) | |
# urls_df = urls_df.iloc[11:,:] | |
# # Output directory | |
# savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData" | |
# os.makedirs(savedir, exist_ok=True) | |
# # Function to process each config | |
# def make_filter_df(row): | |
# state = row['state'] | |
# district = row['district'] | |
# url = row['url'] | |
# try: | |
# df = pl.scan_parquet(url).with_columns( | |
# pl.col('audio').struct.field('path').alias('audio_path') | |
# ).drop('audio').collect() | |
# df.write_parquet(f"{savedir}/{state}_{district}_meta.parquet", compression="gzip") | |
# print(f"β Saved {state}_{district}_meta.parquet") | |
# except Exception as e: | |
# print(f"β Error processing {state}_{district}: {e}") | |
# # Parallel processing using joblib | |
# Parallel(n_jobs=20, backend="loky")( | |
# delayed(make_filter_df)(row) for _, row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70) | |
# ) | |
################################################################################################################################## | |
# import os | |
# from huggingface_hub import login | |
# import polars as pl | |
# from tqdm import tqdm, trange | |
# from datasets import get_dataset_config_names | |
# import pandas as pd | |
# # HF_TOKEN = open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r").read().strip() | |
# # os.environ["HF_TOKEN"] = HF_TOKEN | |
# # login(token=HF_TOKEN) | |
# # configs = get_dataset_config_names("ARTPARK-IISc/Vaani") | |
# configs = ['AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', 'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare', 'Assam_KamrupMetropolitan', 'Assam_Sonitpur', 'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan', 'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar', 'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna', 'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan', 'Chandigarh_Chandigarh', 'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur', 'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon', 'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma', 'Delhi_NewDelhi', 'Goa_NorthSouthGoa', 'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu', 'Jharkhand_Ranchi', 'Jharkhand_Sahebganj', 'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur', 'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga', 'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni', 'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Nagpur', 'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur', 'Meghalaya_WestGaroHills', 'Nagaland_Dimapur', 'Nagaland_Kohima', 'Odisha_Khordha', 'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris', 'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda', 
'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura', 'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah', 'UttarPradesh_Ghazipur', 'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur', 'UttarPradesh_Jalaun', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur', 'UttarPradesh_Lucknow', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'UttarPradesh_Varanasi', 'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi', 'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur', 'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata', 'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia', 'images'] | |
# print(configs, "\n\n") | |
# urls_dict = {'state': [], 'district': [], 'url': [], 'config_name':[]} | |
# for i in configs: | |
# state = i.split('_')[0] | |
# district = i.split('_')[-1] | |
# urls_dict['state'].append(state) | |
# urls_dict['district'].append(district) | |
# urls_dict['url'].append(f"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet") | |
# urls_dict['config_name'].append(i) | |
# urls_df = pd.DataFrame(urls_dict) | |
# # def make_filter_df(state, district, url, savedir): | |
# # df = pl.scan_parquet(url).with_columns( | |
# # pl.col('audio').struct.field('path').alias('audio_path') | |
# # ).drop('audio').collect( | |
# # ).write_parquet( | |
# # f"{savedir}/{state}_{district}_meta.parquet", | |
# # compression="gzip" | |
# # ) | |
# # print(f"β Saved {state}_{district}_meta.parquet") | |
# # def make_filter_df_eager(state, district, url, savedir): | |
# # df = pl.read_parquet(url).with_columns( | |
# # pl.col('audio').struct.field('path').alias('audio_path') | |
# # ).drop('audio') | |
# # output_path = f"{savedir}/{state}_{district}_meta.parquet" | |
# # df.write_parquet(output_path, compression="gzip") | |
# # print(f"β Saved {state}_{district}_meta.parquet") | |
# savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData" | |
# os.makedirs(savedir, exist_ok=True) | |
# done = ["_".join(i.split(".")[:-1][0].split("_")[:-1]) for i in sorted(os.listdir(savedir))] | |
# done | |
# for row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70): | |
# state = row[1]['state'] | |
# district = row[1]['district'] | |
# url = row[1]['url'] | |
# config_name = row[1]['config_name'] | |
# if config_name in done: | |
#         print("β οΈ Already Done ", state, district, url)
# else: | |
# print("π Running ", state, district, url) | |
# make_filter_df_eager(state, district, url, savedir) | |
# done.append(config_name) | |
################################################################################################################################################################################## | |
# import os | |
# from huggingface_hub import login | |
# import polars as pl | |
# from tqdm import tqdm, trange | |
# from datasets import get_dataset_config_names, load_dataset, Audio | |
# import pandas as pd | |
# import subprocess | |
# HF_TOKEN = open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r").read().strip() | |
# configs = ['AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', 'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare', 'Assam_KamrupMetropolitan', 'Assam_Sonitpur', 'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan', 'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar', 'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna', 'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan', 'Chandigarh_Chandigarh', 'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur', 'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon', 'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma', 'Delhi_NewDelhi', 'Goa_NorthSouthGoa', 'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu', 'Jharkhand_Ranchi', 'Jharkhand_Sahebganj', 'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur', 'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga', 'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni', 'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Nagpur', 'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur', 'Meghalaya_WestGaroHills', 'Nagaland_Dimapur', 'Nagaland_Kohima', 'Odisha_Khordha', 'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris', 'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda', 
'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura', 'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah', 'UttarPradesh_Ghazipur', 'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur', 'UttarPradesh_Jalaun', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur', 'UttarPradesh_Lucknow', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'UttarPradesh_Varanasi', 'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi', 'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur', 'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata', 'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia', 'images'] | |
# print(configs, "\n\n") | |
# urls_dict = {'state': [], 'district': [], 'url': [], 'config_name':[]} | |
# for i in configs: | |
# state = i.split('_')[0] | |
# district = i.split('_')[-1] | |
# urls_dict['state'].append(state) | |
# urls_dict['district'].append(district) | |
# urls_dict['url'].append(f"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet") | |
# urls_dict['config_name'].append(i) | |
# urls_df = pd.DataFrame(urls_dict) | |
# def make_filter_df_from_hf(state, district, cache_dir, savedir, hf_token=HF_TOKEN): | |
# ds = load_dataset( | |
# "ARTPARK-IISc/Vaani", | |
# district, | |
# split="train", | |
# num_proc=20, | |
# token=hf_token, | |
# cache_dir=cache_dir, | |
# streaming=False | |
# ) | |
# ds = ds.cast_column("audio", Audio(decode=False)) | |
# audio_paths = [] | |
# columns = {col: [] for col in ds.column_names if col != "audio"} | |
# for row in tqdm(ds, desc=f"Processing {state}_{district}"): | |
# audio_paths.append(row["audio"]["path"]) | |
# for col in columns: | |
# columns[col].append(row[col]) | |
# df = pl.DataFrame({ | |
# "audio_path": audio_paths, | |
# **columns | |
# }) | |
# os.makedirs(savedir, exist_ok=True) | |
# out_path = os.path.join(savedir, f"{state}_{district}_meta.parquet") | |
# df.write_parquet(out_path, compression="gzip") | |
# print(f"β Saved {state}_{district}_meta.parquet to {out_path}") | |
# savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData" | |
# os.makedirs(savedir, exist_ok=True) | |
# done = ["_".join(i.split(".")[:-1][0].split("_")[:-1]) for i in sorted(os.listdir(savedir))] | |
# done | |
# for row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70): | |
# state = row[1]['state'] | |
# district = row[1]['district'] | |
# url = row[1]['url'] | |
# config_name = row[1]['config_name'] | |
# cache_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/{district}' | |
# if config_name in done: | |
#         print("β οΈ Already Done ", state, district, url)
# else: | |
# print("π Running ", state, district, url) | |
# make_filter_df_from_hf(state, district, cache_dir, savedir) | |
# subprocess.run(["rm", "-rf", cache_dir], check=True) | |
# done.append(config_name) | |
################################################################################################################################################################################## | |
from joblib import Parallel, delayed | |
import os | |
from huggingface_hub import login | |
import polars as pl | |
from tqdm import tqdm | |
from datasets import load_dataset, Audio | |
import pandas as pd | |
from time import sleep | |
import multiprocessing | |
import subprocess | |
# Increase timeout for HF downloads (large parquet shards can exceed the default)
os.environ["HF_HUB_TIMEOUT"] = "120" # seconds
# Force the "fork" start method for worker processes.
# NOTE(review): presumably so loky/datasets workers inherit the parent's
# environment (HF_HUB_TIMEOUT, token) — confirm on this cluster.
multiprocessing.set_start_method("fork", force=True)
# Token setup: read the HuggingFace access token from the local CLI token cache
with open("/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token", "r") as f:
    HF_TOKEN = f.read().strip()
# Cache and save directories
savedir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData"
os.makedirs(savedir, exist_ok=True)
# Dataset configs (add all desired configs here)
# Each entry is "<State>_<District>" and names one config of the
# ARTPARK-IISc/Vaani dataset on the HuggingFace Hub.
# NOTE(review): the hub also exposes an "images" config (visible in the
# commented-out runs above); it is not listed here — confirm that is intentional.
configs = [
    # Andhra Pradesh
    'AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna',
    'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam',
    # Arunachal Pradesh
    'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare',
    # Assam
    'Assam_KamrupMetropolitan', 'Assam_Sonitpur',
    # Bihar
    'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan',
    'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar',
    'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna',
    'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul',
    'Bihar_Vaishali', 'Bihar_WestChamparan',
    # Chandigarh
    'Chandigarh_Chandigarh',
    # Chhattisgarh
    'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur',
    'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon',
    'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma',
    # Delhi
    'Delhi_NewDelhi',
    # Goa
    'Goa_NorthSouthGoa',
    # Jharkhand
    'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu',
    'Jharkhand_Ranchi', 'Jharkhand_Sahebganj',
    # Karnataka
    'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur',
    'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga',
    'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga',
    # Madhya Pradesh
    'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni',
    # Maharashtra
    'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Nagpur',
    'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur',
    # Meghalaya
    'Meghalaya_WestGaroHills',
    # Nagaland
    'Nagaland_Dimapur', 'Nagaland_Kohima',
    # Odisha
    'Odisha_Khordha',
    # Rajasthan
    'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur',
    # Tamil Nadu
    'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris',
    # Telangana
    'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda',
    # Tripura
    'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura',
    # Uttar Pradesh
    'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah', 'UttarPradesh_Ghazipur',
    'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur', 'UttarPradesh_Jalaun',
    'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur', 'UttarPradesh_Lucknow',
    'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'UttarPradesh_Varanasi',
    # Uttarakhand
    'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi',
    # West Bengal
    'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur',
    'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata',
    'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia'
]
# Build the work list: one row per dataset config, split into state/district.
records = []
for cfg in configs:
    st, dist = cfg.split('_')
    records.append({'state': st, 'district': dist, 'config_name': cfg})
urls_df = pd.DataFrame(records, columns=['state', 'district', 'config_name'])

def _completed_config(fname):
    # "State_District_meta.parquet" -> "State_District"
    stem = fname.split(".")[0]
    return "_".join(stem.split("_")[:-1])

# Drop configs whose metadata parquet already exists in savedir.
done = [_completed_config(f) for f in os.listdir(savedir) if f.endswith(".parquet")]
urls_df = urls_df[~urls_df['config_name'].isin(done)]
# Function to process each config
def make_filter_df_from_hf(state, district, config_district, savedir, hf_token=HF_TOKEN, retries=3):
    """Download one Vaani config from HF and save its non-audio metadata as parquet.

    Loads the ``config_district`` config of ARTPARK-IISc/Vaani into a local
    scratch cache, keeps only each sample's audio file path (not the waveform),
    writes ``{state}_{district}_meta.parquet`` into ``savedir``, then deletes
    the caches to reclaim scratch space.

    Args:
        state: State name, used in the output file name.
        district: District name; also keys the per-config cache directory.
        config_district: HF dataset config name (e.g. "Bihar_Patna").
        savedir: Directory where the metadata parquet is written.
        hf_token: HuggingFace access token used for the download.
        retries: Maximum number of attempts before giving up on this config.

    Returns:
        None. Side effects: one parquet file written, cache dirs removed.
    """
    cache_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/{district}'
    os.makedirs(cache_dir, exist_ok=True)
    print('\n\n', "-"*10, config_district, "-"*10)
    # BUGFIX: the previous version looped `while True` and never honored the
    # `retries` parameter, so a permanently failing config hung the whole run.
    # Each config now gets at most `retries` attempts.
    for attempt in range(1, retries + 1):
        try:
            ds = load_dataset(
                "ARTPARK-IISc/Vaani",
                config_district,
                split="train",
                num_proc=48,
                token=hf_token,
                cache_dir=cache_dir,
                streaming=False
            )
            # decode=False keeps the audio column as path/bytes metadata, so
            # iterating rows below does not decode any waveforms.
            ds = ds.cast_column("audio", Audio(decode=False))
            audio_paths = []
            columns = {col: [] for col in ds.column_names if col != "audio"}
            for row in tqdm(ds, desc=f"Processing {state}_{district}"):
                audio_paths.append(row["audio"]["path"])
                for col in columns:
                    columns[col].append(row[col])
            df = pl.DataFrame({"audio_path": audio_paths, **columns})
            os.makedirs(savedir, exist_ok=True)
            out_path = os.path.join(savedir, f"{state}_{district}_meta.parquet")
            df.write_parquet(out_path, compression="gzip")
            # Remove both the per-config cache and the shared HF blob store so
            # scratch space does not accumulate across configs.
            subprocess.run(["rm", "-rf", cache_dir, '/scratch/IITB/ai-at-ieor/23m1521/hf_cache/hub/datasets--ARTPARK-IISc--Vaani/blobs/'], check=True)
            print(f"β Saved {state}_{district}_meta.parquet to {out_path}")
            return
        except Exception as e:
            print(f"β Failed {config_district} on attempt {attempt} with error: {e}")
            sleep(5)  # brief backoff; failures are typically transient network errors
    print(f"π¨ Giving up on {state}_{district} after {retries} attempts")
# Parallel execution
# NOTE: n_jobs=1 makes this effectively sequential (one loky worker); raise
# n_jobs to fan out across worker processes, each handling one config.
Parallel(n_jobs=1, backend="loky")(
    delayed(make_filter_df_from_hf)(
        row['state'], row['district'], row['config_name'], savedir
    )
    for _, row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='green', ncols=70)
)