{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "3b7de7a5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n"
]
}
],
"source": [
"import os\n",
"from huggingface_hub import login\n",
"\n",
"HF_TOKEN = open(\"/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token\", \"r\").read().strip()\n",
"os.environ[\"HF_TOKEN\"] = HF_TOKEN\n",
"login(token=HF_TOKEN)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "943e48ba",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)\n",
"
\n",
" Parquet SCAN [https://huggingface.co/datasets/ARTPARK-IISc%2FVaani/resolve/main/audio%2FAndhraPradesh%2FAnantpur%2Ftrain-00000-of-00057.parquet, ... 56 other sources] [id: 140121137377680]
PROJECT */11 COLUMNS
"
],
"text/plain": [
""
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import polars as pl\n",
"\n",
"df2 = pl.scan_parquet('hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Anantpur/train-*.parquet')\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "190bd8f9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)\n",
" \n",
" simple π 11/11 [\"language\", \"languagesKnown\", ... 9 other columns]
WITH_COLUMNS:
[col(\"audio\").struct.field_by_name(path)().alias(\"audio_path\")]
Parquet SCAN [https://huggingface.co/datasets/ARTPARK-IISc%2FVaani/resolve/main/audio%2FAndhraPradesh%2FAnantpur%2Ftrain-00000-of-00057.parquet, ... 56 other sources] [id: 140121137377680]
PROJECT */11 COLUMNS
"
],
"text/plain": [
""
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = df2.with_columns(\n",
"\tpl.col('audio').struct.field('path').alias('audio_path')\n",
").drop('audio')\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1f881998",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2159560/3936327964.py:1: PerformanceWarning: Determining the column names of a LazyFrame requires resolving its schema, which is a potentially expensive operation. Use `LazyFrame.collect_schema().names()` to get the column names without this warning.\n",
" df2.columns\n"
]
},
{
"data": {
"text/plain": [
"['language',\n",
" 'languagesKnown',\n",
" 'gender',\n",
" 'state',\n",
" 'district',\n",
" 'pincode',\n",
" 'stay(years)',\n",
" 'isTranscriptionAvailable',\n",
" 'transcript',\n",
" 'referenceImage',\n",
" 'audio_path']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9b32a79a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
shape: (2, 1)isTranscriptionAvailable |
---|
struct[2] |
{"No",113132} |
{"Yes",5420} |
"
],
"text/plain": [
"shape: (2, 1)\n",
"┌──────────────────────────┐\n",
"│ isTranscriptionAvailable │\n",
"│ --- │\n",
"│ struct[2] │\n",
"╞══════════════════════════╡\n",
"│ {\"No\",113132} │\n",
"│ {\"Yes\",5420} │\n",
"└──────────────────────────┘"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.select(pl.col('isTranscriptionAvailable').value_counts()).collect()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "035af4ce",
"metadata": {},
"outputs": [],
"source": [
"df2.collect().write_parquet(\"AndhraPradesh_Anantpur_meta.parquet\", compression=\"gzip\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d87a8f25",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"\n",
"df = pl.read_parquet('hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Anantpur/train-*.parquet')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e055da50",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n"
]
},
{
"data": {
"text/html": [
"Dask DataFrame Structure:
\n",
"\n",
" \n",
" \n",
" | \n",
" audio | \n",
" language | \n",
" languagesKnown | \n",
" gender | \n",
" state | \n",
" district | \n",
" pincode | \n",
" stay(years) | \n",
" isTranscriptionAvailable | \n",
" transcript | \n",
" referenceImage | \n",
"
\n",
" \n",
" npartitions=54 | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | \n",
" object | \n",
" string | \n",
" string | \n",
" string | \n",
" string | \n",
" string | \n",
" int64 | \n",
" string | \n",
" string | \n",
" string | \n",
" string | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
"
\n",
"Dask Name: read_parquet, 1 expression
"
],
"text/plain": [
"Dask DataFrame Structure:\n",
" audio language languagesKnown gender state district pincode stay(years) isTranscriptionAvailable transcript referenceImage\n",
"npartitions=54 \n",
" object string string string string string int64 string string string string\n",
" ... ... ... ... ... ... ... ... ... ... ...\n",
"... ... ... ... ... ... ... ... ... ... ... ...\n",
" ... ... ... ... ... ... ... ... ... ... ...\n",
" ... ... ... ... ... ... ... ... ... ... ...\n",
"Dask Name: read_parquet, 1 expression\n",
"Expr=ReadParquetFSSpec(446f636)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"from huggingface_hub import login\n",
"\n",
"HF_TOKEN = open(\"/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token\", \"r\").read().strip()\n",
"os.environ[\"HF_TOKEN\"] = HF_TOKEN\n",
"login(token=HF_TOKEN)\n",
"\n",
"import dask.dataframe as dd\n",
"\n",
"df = dd.read_parquet(\"hf://datasets/ARTPARK-IISc/Vaani/audio/Delhi/NewDelhi/train-*.parquet\")\n",
"df "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "d13cc82c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Dask DataFrame Structure:
\n",
"\n",
" \n",
" \n",
" | \n",
" language | \n",
" languagesKnown | \n",
" gender | \n",
" state | \n",
" district | \n",
" pincode | \n",
" stay(years) | \n",
" isTranscriptionAvailable | \n",
" transcript | \n",
" referenceImage | \n",
" audio_path | \n",
"
\n",
" \n",
" npartitions=54 | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | \n",
" string | \n",
" string | \n",
" string | \n",
" string | \n",
" string | \n",
" int64 | \n",
" string | \n",
" string | \n",
" string | \n",
" string | \n",
" object | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
"
\n",
"Dask Name: drop_by_shallow_copy, 5 expressions
"
],
"text/plain": [
"Dask DataFrame Structure:\n",
" language languagesKnown gender state district pincode stay(years) isTranscriptionAvailable transcript referenceImage audio_path\n",
"npartitions=54 \n",
" string string string string string int64 string string string string object\n",
" ... ... ... ... ... ... ... ... ... ... ...\n",
"... ... ... ... ... ... ... ... ... ... ... ...\n",
" ... ... ... ... ... ... ... ... ... ... ...\n",
" ... ... ... ... ... ... ... ... ... ... ...\n",
"Dask Name: drop_by_shallow_copy, 5 expressions\n",
"Expr=Drop(frame=Assign(frame=ReadParquetFSSpec(446f636)), columns='audio')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['audio_path'] = df['audio'].map(lambda x: x.get('path') if isinstance(x, dict) else None, meta=('audio_path', 'object'))\n",
"\n",
"# Drop the original 'audio' column\n",
"df = df.drop(columns='audio')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "bb0dde46",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"'(ReadTimeoutError(\"HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)\"), '(Request ID: e0ac54e2-ea47-4a2a-80a4-c54485398fba)')' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00028-of-00054.parquet\n",
"Retrying in 1s [Retry 1/5].\n",
"'(ReadTimeoutError(\"HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)\"), '(Request ID: f7da590c-34ef-45f7-a824-ceca80b768ff)')' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00043-of-00054.parquet\n",
"Retrying in 1s [Retry 1/5].\n",
"'HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00003-of-00054.parquet\n",
"Retrying in 1s [Retry 1/5].\n",
"'HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00044-of-00054.parquet\n",
"Retrying in 1s [Retry 1/5].\n",
"'HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00043-of-00054.parquet\n",
"Retrying in 2s [Retry 2/5].\n",
"'HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00017-of-00054.parquet\n",
"Retrying in 1s [Retry 1/5].\n",
"'HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00003-of-00054.parquet\n",
"Retrying in 1s [Retry 1/5].\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 3\u001b[39m district = \u001b[33m'\u001b[39m\u001b[33mNewDelhi\u001b[39m\u001b[33m'\u001b[39m\n\u001b[32m 4\u001b[39m output_path = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msavedir\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstate\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdistrict\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m_meta.parquet\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mgzip\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mpyarrow\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwrite_index\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/site-packages/dask/dataframe/dask_expr/_collection.py:3314\u001b[39m, in \u001b[36mDataFrame.to_parquet\u001b[39m\u001b[34m(self, path, **kwargs)\u001b[39m\n\u001b[32m 3311\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mto_parquet\u001b[39m(\u001b[38;5;28mself\u001b[39m, path, **kwargs):\n\u001b[32m 3312\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdask\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdataframe\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdask_expr\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mio\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mparquet\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m to_parquet\n\u001b[32m-> \u001b[39m\u001b[32m3314\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mto_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/site-packages/dask/dataframe/dask_expr/io/parquet.py:661\u001b[39m, in \u001b[36mto_parquet\u001b[39m\u001b[34m(df, path, compression, write_index, append, overwrite, ignore_divisions, partition_on, storage_options, custom_metadata, write_metadata_file, compute, compute_kwargs, schema, name_function, filesystem, engine, **kwargs)\u001b[39m\n\u001b[32m 637\u001b[39m out = new_collection(\n\u001b[32m 638\u001b[39m ToParquet(\n\u001b[32m 639\u001b[39m df,\n\u001b[32m (...)\u001b[39m\u001b[32m 657\u001b[39m )\n\u001b[32m 658\u001b[39m )\n\u001b[32m 660\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m compute:\n\u001b[32m--> \u001b[39m\u001b[32m661\u001b[39m out = \u001b[43mout\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mcompute_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 663\u001b[39m \u001b[38;5;66;03m# Invalidate the filesystem listing cache for the output path after write.\u001b[39;00m\n\u001b[32m 664\u001b[39m \u001b[38;5;66;03m# We do this before returning, even if `compute=False`. This helps ensure\u001b[39;00m\n\u001b[32m 665\u001b[39m \u001b[38;5;66;03m# that reading files that were just written succeeds.\u001b[39;00m\n\u001b[32m 666\u001b[39m fs.invalidate_cache(path)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/site-packages/dask/base.py:373\u001b[39m, in \u001b[36mDaskMethodsMixin.compute\u001b[39m\u001b[34m(self, **kwargs)\u001b[39m\n\u001b[32m 349\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcompute\u001b[39m(\u001b[38;5;28mself\u001b[39m, **kwargs):\n\u001b[32m 350\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Compute this dask collection\u001b[39;00m\n\u001b[32m 351\u001b[39m \n\u001b[32m 352\u001b[39m \u001b[33;03m This turns a lazy Dask collection into its in-memory equivalent.\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 371\u001b[39m \u001b[33;03m dask.compute\u001b[39;00m\n\u001b[32m 372\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m373\u001b[39m (result,) = \u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraverse\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 374\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/site-packages/dask/base.py:681\u001b[39m, in \u001b[36mcompute\u001b[39m\u001b[34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001b[39m\n\u001b[32m 678\u001b[39m expr = expr.optimize()\n\u001b[32m 679\u001b[39m keys = \u001b[38;5;28mlist\u001b[39m(flatten(expr.__dask_keys__()))\n\u001b[32m--> \u001b[39m\u001b[32m681\u001b[39m results = \u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 683\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m repack(results)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/queue.py:202\u001b[39m, in \u001b[36mQueue.get\u001b[39m\u001b[34m(self, block, timeout)\u001b[39m\n\u001b[32m 200\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 201\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._qsize():\n\u001b[32m--> \u001b[39m\u001b[32m202\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnot_empty\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 203\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.is_shutdown \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._qsize():\n\u001b[32m 204\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ShutDown\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/threading.py:359\u001b[39m, in \u001b[36mCondition.wait\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m 357\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[32m 358\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m359\u001b[39m \u001b[43mwaiter\u001b[49m\u001b[43m.\u001b[49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 360\u001b[39m gotit = \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[32m 361\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
"\u001b[31mKeyboardInterrupt\u001b[39m: "
]
}
],
"source": [
"savedir = \"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\"\n",
"state = 'Delhi'\n",
"district = 'NewDelhi'\n",
"output_path = f\"{savedir}/{state}_{district}_meta.parquet\"\n",
"df.to_parquet(output_path, compression='gzip', engine='pyarrow', write_index=False)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "980fc707",
"metadata": {},
"outputs": [],
"source": [
"df = df.with_columns(df['audio'].struct.field('path').alias('audio_path')).drop('audio')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "5fb10cde",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
shape: (118_552, 11)language | languagesKnown | gender | state | district | pincode | stay(years) | isTranscriptionAvailable | transcript | referenceImage | audio_path |
---|
str | str | str | str | str | i64 | str | str | str | str | str |
"Telugu" | "['Telugu', 'English']" | "Female" | "AndhraPradesh" | "Anantpur" | 515671 | "NA(18)" | "No" | null | "Images/IISc_VaaniProject_Anant… | "IISc_VaaniProject_M_AP_Anantpu… |
"Telugu" | "['Telugu', 'English']" | "Female" | "AndhraPradesh" | "Anantpur" | 515671 | "NA(18)" | "No" | null | "Images/IISc_VaaniProject_GENER… | "IISc_VaaniProject_M_AP_Anantpu… |
"Telugu" | "['Telugu', 'English']" | "Female" | "AndhraPradesh" | "Anantpur" | 515671 | "NA(18)" | "No" | null | "Images/IISc_VaaniProject_GENER… | "IISc_VaaniProject_M_AP_Anantpu… |
"Telugu" | "['Telugu', 'English']" | "Female" | "AndhraPradesh" | "Anantpur" | 515671 | "NA(18)" | "No" | null | "Images/IISc_VaaniProject_GENER… | "IISc_VaaniProject_M_AP_Anantpu… |
"Telugu" | "['Telugu', 'English']" | "Female" | "AndhraPradesh" | "Anantpur" | 515671 | "NA(18)" | "No" | null | "Images/IISc_VaaniProject_GENER… | "IISc_VaaniProject_M_AP_Anantpu… |
… | … | … | … | … | … | … | … | … | … | … |
"Hindi" | "['English', 'Hindi']" | "Female" | "AndhraPradesh" | "Anantpur" | 515414 | "Anantpur(36)" | "No" | null | "Images/IISc_VaaniProject_Anant… | "IISc_VaaniProject_S_AP_Anantpu… |
"Hindi" | "['English', 'Hindi']" | "Female" | "AndhraPradesh" | "Anantpur" | 515414 | "Anantpur(36)" | "No" | null | "Images/IISc_VaaniProject_Anant… | "IISc_VaaniProject_S_AP_Anantpu… |
"Hindi" | "['English', 'Hindi']" | "Female" | "AndhraPradesh" | "Anantpur" | 515414 | "Anantpur(36)" | "No" | null | "Images/IISc_VaaniProject_Anant… | "IISc_VaaniProject_S_AP_Anantpu… |
"Hindi" | "['English', 'Hindi']" | "Female" | "AndhraPradesh" | "Anantpur" | 515414 | "Anantpur(36)" | "No" | null | "Images/IISc_VaaniProject_Anant… | "IISc_VaaniProject_S_AP_Anantpu… |
"Hindi" | "['English', 'Hindi']" | "Female" | "AndhraPradesh" | "Anantpur" | 515414 | "Anantpur(36)" | "No" | null | "Images/IISc_VaaniProject_Anant… | "IISc_VaaniProject_S_AP_Anantpu… |
"
],
"text/plain": [
"shape: (118_552, 11)\n",
"┌──────────┬────────────┬────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐\n",
"│ language ┆ languagesK ┆ gender ┆ state ┆ … ┆ isTranscri ┆ transcrip ┆ reference ┆ audio_pat │\n",
"│ --- ┆ nown ┆ --- ┆ --- ┆ ┆ ptionAvail ┆ t ┆ Image ┆ h │\n",
"│ str ┆ --- ┆ str ┆ str ┆ ┆ able ┆ --- ┆ --- ┆ --- │\n",
"│ ┆ str ┆ ┆ ┆ ┆ --- ┆ str ┆ str ┆ str │\n",
"│ ┆ ┆ ┆ ┆ ┆ str ┆ ┆ ┆ │\n",
"╞══════════╪════════════╪════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡\n",
"│ Telugu ┆ ['Telugu', ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n",
"│ ┆ 'English'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ M_AP_Anan │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n",
"│ Telugu ┆ ['Telugu', ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n",
"│ ┆ 'English'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE ┆ M_AP_Anan │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… ┆ tpu… │\n",
"│ Telugu ┆ ['Telugu', ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n",
"│ ┆ 'English'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE ┆ M_AP_Anan │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… ┆ tpu… │\n",
"│ Telugu ┆ ['Telugu', ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n",
"│ ┆ 'English'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE ┆ M_AP_Anan │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… ┆ tpu… │\n",
"│ Telugu ┆ ['Telugu', ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n",
"│ ┆ 'English'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE ┆ M_AP_Anan │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… ┆ tpu… │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ Hindi ┆ ['English' ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n",
"│ ┆ , 'Hindi'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ S_AP_Anan │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n",
"│ Hindi ┆ ['English' ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n",
"│ ┆ , 'Hindi'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ S_AP_Anan │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n",
"│ Hindi ┆ ['English' ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n",
"│ ┆ , 'Hindi'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ S_AP_Anan │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n",
"│ Hindi ┆ ['English' ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n",
"│ ┆ , 'Hindi'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ S_AP_Anan │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n",
"│ Hindi ┆ ['English' ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n",
"│ ┆ , 'Hindi'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ S_AP_Anan │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n",
"└──────────┴────────────┴────────┴────────────┴───┴────────────┴───────────┴───────────┴───────────┘"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "a1f731e6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
shape: (2, 2)isTranscriptionAvailable | count |
---|
str | u32 |
"Yes" | 5420 |
"No" | 113132 |
"
],
"text/plain": [
"shape: (2, 2)\n",
"┌──────────────────────────┬────────┐\n",
"│ isTranscriptionAvailable ┆ count │\n",
"│ --- ┆ --- │\n",
"│ str ┆ u32 │\n",
"╞══════════════════════════╪════════╡\n",
"│ Yes ┆ 5420 │\n",
"│ No ┆ 113132 │\n",
"└──────────────────────────┴────────┘"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['isTranscriptionAvailable'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "eef6bd00",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'IISc_VaaniProject_M_AP_Anantpur_00014520_1544240000_APATSR_190315_1880_16300.wav'"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['audio'][0]['path']"
]
},
{
"cell_type": "markdown",
"id": "e9130188",
"metadata": {},
"source": [
"
\n",
"
"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ba3e08c6",
"metadata": {},
"outputs": [],
"source": [
"import datasets\n",
"from datasets import Audio\n",
"from datasets import load_dataset\n",
"from datasets import get_dataset_config_names\n",
"import os\n",
"from huggingface_hub import login\n",
"HF_TOKEN = open(\"/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token\", \"r\").read().strip()\n",
"login(token=HF_TOKEN)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "52b42738",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ad907faba3574297b89f5d659cf8726f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Resolving data files: 0%| | 0/57 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "08bd4987f85f4fcc86589eb26850c90c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Resolving data files: 0%| | 0/54 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c2aa54611d274e91b7a8f5fc974b4ca4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading dataset shards: 0%| | 0/54 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['audio', 'language', 'languagesKnown', 'gender', 'state', 'district', 'pincode', 'stay(years)', 'isTranscriptionAvailable', 'transcript', 'referenceImage'],\n",
" num_rows: 164125\n",
"})"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"district = \"Delhi_NewDelhi\"\n",
"cache_dir = f'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/{district}'\n",
"\n",
"\n",
"ds = load_dataset(\"ARTPARK-IISc/Vaani\", district, split=\"train\", num_proc=20,\n",
" token=HF_TOKEN, cache_dir=cache_dir, streaming=False)\n",
"ds = ds.cast_column(\"audio\", datasets.Audio(decode=False))\n",
"ds"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e86b30d8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing rows: 100%|\u001b[32m██████\u001b[0m| 164125/164125 [00:30<00:00, 5299.76it/s]\u001b[0m\n"
]
}
],
"source": [
"from tqdm import tqdm\n",
"import polars as pl\n",
"\n",
"# Initialize empty containers\n",
"audio_paths = []\n",
"columns = {col: [] for col in ds.column_names if col != \"audio\"}\n",
"\n",
"# Iterate with tqdm\n",
"for row in tqdm(ds, desc=\"Processing rows\", ncols=70, colour='green'):\n",
" audio_paths.append(row[\"audio\"][\"path\"])\n",
" for col in columns:\n",
" columns[col].append(row[col])\n",
"\n",
"# Combine into Polars DataFrame\n",
"df = pl.DataFrame({\n",
" \"audio_path\": audio_paths,\n",
" **columns\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "571f94f4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
shape: (164_125, 11)audio_path | language | languagesKnown | gender | state | district | pincode | stay(years) | isTranscriptionAvailable | transcript | referenceImage |
---|
str | str | str | str | str | str | i64 | str | str | str | str |
"IISc_VaaniProject_M_Delhi_NewD… | "Hindi" | "['Hindi']" | "Female" | "Delhi" | "NewDelhi" | 110004 | "NewDelhi(20)" | "No" | null | "Images/IISc_VaaniProject_GENER… |
"IISc_VaaniProject_M_Delhi_NewD… | "Hindi" | "['Hindi']" | "Female" | "Delhi" | "NewDelhi" | 110001 | "NewDelhi(19)" | "No" | null | "Images/IISc_VaaniProject_GENER… |
"IISc_VaaniProject_M_Delhi_NewD… | "Hindi" | "['Hindi']" | "Female" | "Delhi" | "NewDelhi" | 110067 | "NewDelhi(11)" | "No" | null | "Images/IISc_VaaniProject_NewDe… |
"IISc_VaaniProject_M_Delhi_NewD… | "Hindi" | "['Hindi']" | "Male" | "Delhi" | "NewDelhi" | 110001 | "NewDelhi(24)" | "Yes" | "ऐच_डी_ऐफ_सी बैंक {H_D_F_C bank… | "Images/IISc_VaaniProject_GENER… |
"IISc_VaaniProject_M_Delhi_NewD… | "Hindi" | "['Hindi']" | "Male" | "Delhi" | "NewDelhi" | 110023 | "NewDelhi(20)" | "No" | null | "Images/IISc_VaaniProject_GENER… |
… | … | … | … | … | … | … | … | … | … | … |
"IISc_VaaniProject_M_Delhi_NewD… | "Hindi" | "['Hindi']" | "Male" | "Delhi" | "NewDelhi" | 110046 | "NewDelhi(45)" | "No" | null | "Images/IISc_VaaniProject_NewDe… |
"IISc_VaaniProject_M_Delhi_NewD… | "Hindi" | "['Hindi']" | "Female" | "Delhi" | "NewDelhi" | 110021 | "CentralDelhi(31)" | "No" | null | "Images/IISc_VaaniProject_NewDe… |
"IISc_VaaniProject_M_Delhi_NewD… | "Hindi" | "['Hindi']" | "Female" | "Delhi" | "NewDelhi" | 110004 | "NewDelhi(20)" | "No" | null | "Images/IISc_VaaniProject_NewDe… |
"IISc_VaaniProject_M_Delhi_NewD… | "Hindi" | "['Hindi']" | "Male" | "Delhi" | "NewDelhi" | 110028 | "NewDelhi(26)" | "No" | null | "Images/IISc_VaaniProject_NewDe… |
"IISc_VaaniProject_M_Delhi_NewD… | "Hindi" | "['English', 'Hindi']" | "Male" | "Delhi" | "NewDelhi" | 110070 | "NewDelhi(41)" | "No" | null | "Images/IISc_VaaniProject_NewDe… |
"
],
"text/plain": [
"shape: (164_125, 11)\n",
"┌────────────┬──────────┬────────────┬────────┬───┬────────────┬───────────┬───────────┬───────────┐\n",
"│ audio_path ┆ language ┆ languagesK ┆ gender ┆ … ┆ stay(years ┆ isTranscr ┆ transcrip ┆ reference │\n",
"│ --- ┆ --- ┆ nown ┆ --- ┆ ┆ ) ┆ iptionAva ┆ t ┆ Image │\n",
"│ str ┆ str ┆ --- ┆ str ┆ ┆ --- ┆ ilable ┆ --- ┆ --- │\n",
"│ ┆ ┆ str ┆ ┆ ┆ str ┆ --- ┆ str ┆ str │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ ┆ │\n",
"╞════════════╪══════════╪════════════╪════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡\n",
"│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Female ┆ … ┆ NewDelhi(2 ┆ No ┆ null ┆ Images/II │\n",
"│ Project_M_ ┆ ┆ ┆ ┆ ┆ 0) ┆ ┆ ┆ Sc_VaaniP │\n",
"│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE │\n",
"│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… │\n",
"│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Female ┆ … ┆ NewDelhi(1 ┆ No ┆ null ┆ Images/II │\n",
"│ Project_M_ ┆ ┆ ┆ ┆ ┆ 9) ┆ ┆ ┆ Sc_VaaniP │\n",
"│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE │\n",
"│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… │\n",
"│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Female ┆ … ┆ NewDelhi(1 ┆ No ┆ null ┆ Images/II │\n",
"│ Project_M_ ┆ ┆ ┆ ┆ ┆ 1) ┆ ┆ ┆ Sc_VaaniP │\n",
"│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n",
"│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n",
"│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Male ┆ … ┆ NewDelhi(2 ┆ Yes ┆ ऐच_डी_ऐफ_ ┆ Images/II │\n",
"│ Project_M_ ┆ ┆ ┆ ┆ ┆ 4) ┆ ┆ सी बैंक ┆ Sc_VaaniP │\n",
"│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ {H_D_F_C ┆ roject_GE │\n",
"│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ bank… ┆ NER… │\n",
"│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Male ┆ … ┆ NewDelhi(2 ┆ No ┆ null ┆ Images/II │\n",
"│ Project_M_ ┆ ┆ ┆ ┆ ┆ 0) ┆ ┆ ┆ Sc_VaaniP │\n",
"│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE │\n",
"│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Male ┆ … ┆ NewDelhi(4 ┆ No ┆ null ┆ Images/II │\n",
"│ Project_M_ ┆ ┆ ┆ ┆ ┆ 5) ┆ ┆ ┆ Sc_VaaniP │\n",
"│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n",
"│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n",
"│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Female ┆ … ┆ CentralDel ┆ No ┆ null ┆ Images/II │\n",
"│ Project_M_ ┆ ┆ ┆ ┆ ┆ hi(31) ┆ ┆ ┆ Sc_VaaniP │\n",
"│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n",
"│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n",
"│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Female ┆ … ┆ NewDelhi(2 ┆ No ┆ null ┆ Images/II │\n",
"│ Project_M_ ┆ ┆ ┆ ┆ ┆ 0) ┆ ┆ ┆ Sc_VaaniP │\n",
"│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n",
"│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n",
"│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Male ┆ … ┆ NewDelhi(2 ┆ No ┆ null ┆ Images/II │\n",
"│ Project_M_ ┆ ┆ ┆ ┆ ┆ 6) ┆ ┆ ┆ Sc_VaaniP │\n",
"│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n",
"│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n",
"│ IISc_Vaani ┆ Hindi ┆ ['English' ┆ Male ┆ … ┆ NewDelhi(4 ┆ No ┆ null ┆ Images/II │\n",
"│ Project_M_ ┆ ┆ , 'Hindi'] ┆ ┆ ┆ 1) ┆ ┆ ┆ Sc_VaaniP │\n",
"│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n",
"│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n",
"└────────────┴──────────┴────────────┴────────┴───┴────────────┴───────────┴───────────┴───────────┘"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "160b98ab",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/Delhi_NewDelhi'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cache_dir"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e781d016",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Files found: 54\n"
]
}
],
"source": [
"def walkDIR(folder_path, include=None):\n",
" file_list = []\n",
" for root, _, files in os.walk(folder_path):\n",
" for file in files:\n",
" if include is None or any(file.endswith(ext) for ext in include):\n",
" file_list.append(os.path.join(root, file))\n",
" print(\"Files found:\", len(file_list))\n",
" return file_list\n",
"\n",
"shards = walkDIR(cache_dir, include=['.arrow'])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "86c09579",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/Delhi_NewDelhi/ARTPARK-IISc___vaani/Delhi_NewDelhi/0.0.0/bebdc89de129c988e87623d031860f3abdb77ac5/vaani-train-00011-of-00054.arrow'"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shards[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a431824",
"metadata": {},
"outputs": [],
"source": [
"'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/Delhi_NewDelhi/ARTPARK-IISc___vaani/Delhi_NewDelhi/0.0.0'"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "11dd7e8f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/Delhi_NewDelhi/ARTPARK-IISc___vaani/Delhi_NewDelhi/0.0.0/bebdc89de129c988e87623d031860f3abdb77ac5/vaani-train-00011-of-00054.arrow'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shards[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2aae2b9",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset, Audio\n",
"import polars as pl\n",
"import os\n",
"from tqdm import tqdm\n",
"\n",
"def make_filter_df_from_hf(state, district, cache_dir, savedir, hf_token=None):\n",
" # Load dataset\n",
" ds = load_dataset(\n",
" \"ARTPARK-IISc/Vaani\",\n",
" district,\n",
" split=\"train\",\n",
" num_proc=20,\n",
" token=hf_token,\n",
" cache_dir=cache_dir,\n",
" streaming=False\n",
" )\n",
"\n",
" # Cast audio column to keep path only (no decoding)\n",
" ds = ds.cast_column(\"audio\", Audio(decode=False))\n",
"\n",
" # Initialize empty containers\n",
" audio_paths = []\n",
" columns = {col: [] for col in ds.column_names if col != \"audio\"}\n",
"\n",
" # Iterate with tqdm\n",
" for row in tqdm(ds, desc=f\"Processing {state}_{district}\"):\n",
" audio_paths.append(row[\"audio\"][\"path\"])\n",
" for col in columns:\n",
" columns[col].append(row[col])\n",
"\n",
" # Combine into Polars DataFrame\n",
" df = pl.DataFrame({\n",
" \"audio_path\": audio_paths,\n",
" **columns\n",
" })\n",
"\n",
" # Save as Parquet\n",
" os.makedirs(savedir, exist_ok=True)\n",
" out_path = os.path.join(savedir, f\"{state}_{district}_meta.parquet\")\n",
" df.write_parquet(out_path, compression=\"gzip\")\n",
"\n",
" print(f\"✅ Saved {state}_{district}_meta.parquet to {out_path}\")\n"
]
},
{
"cell_type": "markdown",
"id": "ccb2043e",
"metadata": {},
"source": [
"
\n",
"
"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6c24f6db",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from huggingface_hub import login\n",
"\n",
"# HF_TOKEN = open(\"/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token\", \"r\").read().strip()\n",
"# os.environ[\"HF_TOKEN\"] = HF_TOKEN\n",
"# login(token=HF_TOKEN)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "61363e0b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', 'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare', 'Assam_KamrupMetropolitan', 'Assam_Sonitpur', 'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan', 'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar', 'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna', 'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan', 'Chandigarh_Chandigarh', 'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur', 'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon', 'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma', 'Delhi_NewDelhi', 'Goa_NorthSouthGoa', 'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu', 'Jharkhand_Ranchi', 'Jharkhand_Sahebganj', 'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur', 'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga', 'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni', 'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Nagpur', 'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur', 'Meghalaya_WestGaroHills', 'Nagaland_Dimapur', 'Nagaland_Kohima', 'Odisha_Khordha', 'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris', 'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda', 'Tripura_Dhalai', 
'Tripura_Unakoti', 'Tripura_WestTripura', 'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah', 'UttarPradesh_Ghazipur', 'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur', 'UttarPradesh_Jalaun', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur', 'UttarPradesh_Lucknow', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'UttarPradesh_Varanasi', 'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi', 'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur', 'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata', 'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia', 'images'] \n",
"\n",
"\n"
]
}
],
"source": [
"# from datasets import get_dataset_config_names\n",
"# configs = get_dataset_config_names(\"ARTPARK-IISc/Vaani\")\n",
"\n",
"configs = ['AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', 'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare', 'Assam_KamrupMetropolitan', 'Assam_Sonitpur', 'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan', 'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar', 'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna', 'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan', 'Chandigarh_Chandigarh', 'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur', 'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon', 'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma', 'Delhi_NewDelhi', 'Goa_NorthSouthGoa', 'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu', 'Jharkhand_Ranchi', 'Jharkhand_Sahebganj', 'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur', 'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga', 'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni', 'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Nagpur', 'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur', 'Meghalaya_WestGaroHills', 'Nagaland_Dimapur', 'Nagaland_Kohima', 'Odisha_Khordha', 'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris', 'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda', 
'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura', 'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah', 'UttarPradesh_Ghazipur', 'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur', 'UttarPradesh_Jalaun', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur', 'UttarPradesh_Lucknow', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'UttarPradesh_Varanasi', 'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi', 'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur', 'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata', 'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia', 'images']\n",
"print(configs, \"\\n\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "677169ff",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" state | \n",
" district | \n",
" url | \n",
" config_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... | \n",
" AndhraPradesh_Anantpur | \n",
"
\n",
" \n",
" 1 | \n",
" AndhraPradesh | \n",
" Chittoor | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... | \n",
" AndhraPradesh_Chittoor | \n",
"
\n",
" \n",
" 2 | \n",
" AndhraPradesh | \n",
" Guntur | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... | \n",
" AndhraPradesh_Guntur | \n",
"
\n",
" \n",
" 3 | \n",
" AndhraPradesh | \n",
" Krishna | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... | \n",
" AndhraPradesh_Krishna | \n",
"
\n",
" \n",
" 4 | \n",
" AndhraPradesh | \n",
" Srikakulam | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... | \n",
" AndhraPradesh_Srikakulam | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 116 | \n",
" WestBengal | \n",
" Malda | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... | \n",
" WestBengal_Malda | \n",
"
\n",
" \n",
" 117 | \n",
" WestBengal | \n",
" North24Parganas | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... | \n",
" WestBengal_North24Parganas | \n",
"
\n",
" \n",
" 118 | \n",
" WestBengal | \n",
" PaschimMedinipur | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... | \n",
" WestBengal_PaschimMedinipur | \n",
"
\n",
" \n",
" 119 | \n",
" WestBengal | \n",
" Purulia | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... | \n",
" WestBengal_Purulia | \n",
"
\n",
" \n",
" 120 | \n",
" images | \n",
" images | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/images/... | \n",
" images | \n",
"
\n",
" \n",
"
\n",
"
121 rows × 4 columns
\n",
"
"
],
"text/plain": [
" state district \\\n",
"0 AndhraPradesh Anantpur \n",
"1 AndhraPradesh Chittoor \n",
"2 AndhraPradesh Guntur \n",
"3 AndhraPradesh Krishna \n",
"4 AndhraPradesh Srikakulam \n",
".. ... ... \n",
"116 WestBengal Malda \n",
"117 WestBengal North24Parganas \n",
"118 WestBengal PaschimMedinipur \n",
"119 WestBengal Purulia \n",
"120 images images \n",
"\n",
" url \\\n",
"0 hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... \n",
"1 hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... \n",
"2 hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... \n",
"3 hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... \n",
"4 hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... \n",
".. ... \n",
"116 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n",
"117 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n",
"118 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n",
"119 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n",
"120 hf://datasets/ARTPARK-IISc/Vaani/audio/images/... \n",
"\n",
" config_name \n",
"0 AndhraPradesh_Anantpur \n",
"1 AndhraPradesh_Chittoor \n",
"2 AndhraPradesh_Guntur \n",
"3 AndhraPradesh_Krishna \n",
"4 AndhraPradesh_Srikakulam \n",
".. ... \n",
"116 WestBengal_Malda \n",
"117 WestBengal_North24Parganas \n",
"118 WestBengal_PaschimMedinipur \n",
"119 WestBengal_Purulia \n",
"120 images \n",
"\n",
"[121 rows x 4 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"urls_dict = {'state': [], 'district': [], 'url': [], 'config_name':[]}\n",
"\n",
"for i in configs:\n",
" state = i.split('_')[0]\n",
" district = i.split('_')[-1]\n",
" urls_dict['state'].append(state)\n",
" urls_dict['district'].append(district)\n",
" urls_dict['url'].append(f\"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet\")\n",
" urls_dict['config_name'].append(i)\n",
"\n",
"urls_df = pd.DataFrame(urls_dict)\n",
"urls_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "9b8cbec4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" state | \n",
" district | \n",
" url | \n",
" config_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 11 | \n",
" Bihar | \n",
" Begusarai | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/B... | \n",
" Bihar_Begusarai | \n",
"
\n",
" \n",
" 12 | \n",
" Bihar | \n",
" Bhagalpur | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/B... | \n",
" Bihar_Bhagalpur | \n",
"
\n",
" \n",
" 13 | \n",
" Bihar | \n",
" Darbhanga | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/D... | \n",
" Bihar_Darbhanga | \n",
"
\n",
" \n",
" 14 | \n",
" Bihar | \n",
" EastChamparan | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/E... | \n",
" Bihar_EastChamparan | \n",
"
\n",
" \n",
" 15 | \n",
" Bihar | \n",
" Gaya | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/G... | \n",
" Bihar_Gaya | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 116 | \n",
" WestBengal | \n",
" Malda | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... | \n",
" WestBengal_Malda | \n",
"
\n",
" \n",
" 117 | \n",
" WestBengal | \n",
" North24Parganas | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... | \n",
" WestBengal_North24Parganas | \n",
"
\n",
" \n",
" 118 | \n",
" WestBengal | \n",
" PaschimMedinipur | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... | \n",
" WestBengal_PaschimMedinipur | \n",
"
\n",
" \n",
" 119 | \n",
" WestBengal | \n",
" Purulia | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... | \n",
" WestBengal_Purulia | \n",
"
\n",
" \n",
" 120 | \n",
" images | \n",
" images | \n",
" hf://datasets/ARTPARK-IISc/Vaani/audio/images/... | \n",
" images | \n",
"
\n",
" \n",
"
\n",
"
110 rows × 4 columns
\n",
"
"
],
"text/plain": [
" state district \\\n",
"11 Bihar Begusarai \n",
"12 Bihar Bhagalpur \n",
"13 Bihar Darbhanga \n",
"14 Bihar EastChamparan \n",
"15 Bihar Gaya \n",
".. ... ... \n",
"116 WestBengal Malda \n",
"117 WestBengal North24Parganas \n",
"118 WestBengal PaschimMedinipur \n",
"119 WestBengal Purulia \n",
"120 images images \n",
"\n",
" url \\\n",
"11 hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/B... \n",
"12 hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/B... \n",
"13 hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/D... \n",
"14 hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/E... \n",
"15 hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/G... \n",
".. ... \n",
"116 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n",
"117 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n",
"118 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n",
"119 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n",
"120 hf://datasets/ARTPARK-IISc/Vaani/audio/images/... \n",
"\n",
" config_name \n",
"11 Bihar_Begusarai \n",
"12 Bihar_Bhagalpur \n",
"13 Bihar_Darbhanga \n",
"14 Bihar_EastChamparan \n",
"15 Bihar_Gaya \n",
".. ... \n",
"116 WestBengal_Malda \n",
"117 WestBengal_North24Parganas \n",
"118 WestBengal_PaschimMedinipur \n",
"119 WestBengal_Purulia \n",
"120 images \n",
"\n",
"[110 rows x 4 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"urls_df.iloc[11:,:]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "85eef080",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"144K\t/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/hub/.locks/datasets--ARTPARK-IISc--Vaani\n"
]
}
],
"source": [
"!du -sh /home/IITB/ai-at-ieor/23m1521/.cache/huggingface/hub/.locks/datasets--ARTPARK-IISc--Vaani"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1966baa0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|\u001b[33m████████████████████████████\u001b[0m| 121/121 [00:00<00:00, 21100.56it/s]\u001b[0m"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"AndhraPradesh Anantpur hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Anantpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"AndhraPradesh Chittoor hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Chittoor/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"AndhraPradesh Guntur hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Guntur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"AndhraPradesh Krishna hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Krishna/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"AndhraPradesh Srikakulam hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Srikakulam/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"AndhraPradesh Vishakapattanam hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Vishakapattanam/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"ArunachalPradesh Longding hf://datasets/ARTPARK-IISc/Vaani/audio/ArunachalPradesh/Longding/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"ArunachalPradesh PapumPare hf://datasets/ARTPARK-IISc/Vaani/audio/ArunachalPradesh/PapumPare/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Assam KamrupMetropolitan hf://datasets/ARTPARK-IISc/Vaani/audio/Assam/KamrupMetropolitan/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Assam Sonitpur hf://datasets/ARTPARK-IISc/Vaani/audio/Assam/Sonitpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Araria hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Araria/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Begusarai hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Begusarai/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Bhagalpur hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Bhagalpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Darbhanga hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Darbhanga/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar EastChamparan hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/EastChamparan/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Gaya hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Gaya/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Gopalganj hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Gopalganj/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Jahanabad hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Jahanabad/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Jamui hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Jamui/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Kaimur hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Kaimur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Katihar hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Katihar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Kishanganj hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Kishanganj/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Lakhisarai hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Lakhisarai/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Madhepura hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Madhepura/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Muzaffarpur hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Muzaffarpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Patna hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Patna/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Purnia hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Purnia/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Saharsa hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Saharsa/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Samastipur hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Samastipur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Saran hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Saran/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Sitamarhi hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Sitamarhi/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Supaul hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Supaul/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar Vaishali hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Vaishali/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Bihar WestChamparan hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/WestChamparan/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chandigarh Chandigarh hf://datasets/ARTPARK-IISc/Vaani/audio/Chandigarh/Chandigarh/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chhattisgarh Balrampur hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Balrampur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chhattisgarh Bastar hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Bastar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chhattisgarh Bilaspur hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Bilaspur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chhattisgarh Jashpur hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Jashpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chhattisgarh Kabirdham hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Kabirdham/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chhattisgarh Korba hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Korba/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chhattisgarh Raigarh hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Raigarh/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chhattisgarh Rajnandgaon hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Rajnandgaon/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chhattisgarh Sarguja hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Sarguja/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Chhattisgarh Sukma hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Sukma/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Delhi NewDelhi hf://datasets/ARTPARK-IISc/Vaani/audio/Delhi/NewDelhi/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Goa NorthSouthGoa hf://datasets/ARTPARK-IISc/Vaani/audio/Goa/NorthSouthGoa/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Jharkhand Deoghar hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Deoghar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Jharkhand Garhwa hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Garhwa/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Jharkhand Jamtara hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Jamtara/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Jharkhand Palamu hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Palamu/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Jharkhand Ranchi hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Ranchi/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Jharkhand Sahebganj hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Sahebganj/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Bangalore hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Bangalore/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Belgaum hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Belgaum/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Bellary hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Bellary/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Bidar hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Bidar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Bijapur hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Bijapur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Chamrajnagar hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Chamrajnagar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka DakshinKannada hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/DakshinKannada/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Dharwad hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Dharwad/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Gulbarga hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Gulbarga/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Koppal hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Koppal/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Mysore hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Mysore/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Raichur hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Raichur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Karnataka Shimoga hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Shimoga/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"MadhyaPradesh Bhopal hf://datasets/ARTPARK-IISc/Vaani/audio/MadhyaPradesh/Bhopal/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"MadhyaPradesh Dhar hf://datasets/ARTPARK-IISc/Vaani/audio/MadhyaPradesh/Dhar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"MadhyaPradesh Katni hf://datasets/ARTPARK-IISc/Vaani/audio/MadhyaPradesh/Katni/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Maharashtra Aurangabad hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Aurangabad/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Maharashtra Chandrapur hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Chandrapur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Maharashtra Dhule hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Dhule/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Maharashtra Nagpur hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Nagpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Maharashtra Pune hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Pune/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Maharashtra Sindhudurga hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Sindhudurga/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Maharashtra Solapur hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Solapur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Meghalaya WestGaroHills hf://datasets/ARTPARK-IISc/Vaani/audio/Meghalaya/WestGaroHills/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Nagaland Dimapur hf://datasets/ARTPARK-IISc/Vaani/audio/Nagaland/Dimapur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Nagaland Kohima hf://datasets/ARTPARK-IISc/Vaani/audio/Nagaland/Kohima/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Odisha Khordha hf://datasets/ARTPARK-IISc/Vaani/audio/Odisha/Khordha/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Rajasthan Churu hf://datasets/ARTPARK-IISc/Vaani/audio/Rajasthan/Churu/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Rajasthan Jaipur hf://datasets/ARTPARK-IISc/Vaani/audio/Rajasthan/Jaipur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Rajasthan Nagaur hf://datasets/ARTPARK-IISc/Vaani/audio/Rajasthan/Nagaur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"TamilNadu Chennai hf://datasets/ARTPARK-IISc/Vaani/audio/TamilNadu/Chennai/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"TamilNadu Kanyakumari hf://datasets/ARTPARK-IISc/Vaani/audio/TamilNadu/Kanyakumari/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"TamilNadu Namakkal hf://datasets/ARTPARK-IISc/Vaani/audio/TamilNadu/Namakkal/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"TamilNadu Nilgiris hf://datasets/ARTPARK-IISc/Vaani/audio/TamilNadu/Nilgiris/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Telangana Hyderabad hf://datasets/ARTPARK-IISc/Vaani/audio/Telangana/Hyderabad/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Telangana Karimnagar hf://datasets/ARTPARK-IISc/Vaani/audio/Telangana/Karimnagar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Telangana Mahabubabad hf://datasets/ARTPARK-IISc/Vaani/audio/Telangana/Mahabubabad/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Telangana Nalgonda hf://datasets/ARTPARK-IISc/Vaani/audio/Telangana/Nalgonda/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Tripura Dhalai hf://datasets/ARTPARK-IISc/Vaani/audio/Tripura/Dhalai/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Tripura Unakoti hf://datasets/ARTPARK-IISc/Vaani/audio/Tripura/Unakoti/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Tripura WestTripura hf://datasets/ARTPARK-IISc/Vaani/audio/Tripura/WestTripura/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Budaun hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Budaun/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Deoria hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Deoria/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Etah hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Etah/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Ghazipur hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Ghazipur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Gorakhpur hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Gorakhpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Hamirpur hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Hamirpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Jalaun hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Jalaun/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh JyotibaPhuleNagar hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/JyotibaPhuleNagar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Lalitpur hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Lalitpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Lucknow hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Lucknow/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Muzzaffarnagar hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Muzzaffarnagar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Saharanpur hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Saharanpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"UttarPradesh Varanasi hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Varanasi/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Uttarakhand TehriGarhwal hf://datasets/ARTPARK-IISc/Vaani/audio/Uttarakhand/TehriGarhwal/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"Uttarakhand Uttarkashi hf://datasets/ARTPARK-IISc/Vaani/audio/Uttarakhand/Uttarkashi/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal Alipurduar hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Alipurduar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal CoochBehar hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/CoochBehar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal DakshinDinajpur hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/DakshinDinajpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal Darjeeling hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Darjeeling/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal Jalpaiguri hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Jalpaiguri/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal Jhargram hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Jhargram/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal Kolkata hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Kolkata/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal Malda hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Malda/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal North24Parganas hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/North24Parganas/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal PaschimMedinipur hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/PaschimMedinipur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"WestBengal Purulia hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Purulia/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n",
"images images hf://datasets/ARTPARK-IISc/Vaani/audio/images/images/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import polars as pl\n",
"from tqdm import tqdm, trange\n",
"import os\n",
"\n",
"def make_filter_df(state, district, url, savedir):\n",
"    \"\"\"Save one district's table, minus the `audio` struct column, as a parquet file.\n",
"\n",
"    The `path` field of the `audio` struct is kept as a plain `audio_path` column\n",
"    before the struct itself is dropped.\n",
"\n",
"    Args:\n",
"        state: Used as the first part of the output file name.\n",
"        district: Used as the second part of the output file name.\n",
"        url: Source passed unchanged to `pl.scan_parquet`.\n",
"        savedir: Directory that receives `{state}_{district}_meta.parquet`.\n",
"\n",
"    Returns:\n",
"        None. The file on disk and the printed confirmation are the only results.\n",
"    \"\"\"\n",
"    meta_lazy = (\n",
"        pl.scan_parquet(url)\n",
"        .with_columns(pl.col('audio').struct.field('path').alias('audio_path'))\n",
"        .drop('audio')\n",
"    )\n",
"    # The write result is intentionally not bound to a name: nothing downstream uses it.\n",
"    meta_lazy.collect().write_parquet(\n",
"        f\"{savedir}/{state}_{district}_meta.parquet\",\n",
"        compression=\"gzip\",\n",
"    )\n",
"    print(f\"✅ Saved {state}_{district}_meta.parquet\")\n",
" \n",
"# Destination directory for the per-district metadata files; created if missing.\n",
"savedir = \"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\"\n",
"os.makedirs(savedir, exist_ok=True)\n",
"\n",
"# This loop only prints each row's fields; make_filter_df is not called here.\n",
"progress = tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70)\n",
"for _, record in progress:\n",
"    state, district, url = record['state'], record['district'], record['url']\n",
"    print(state, district, url, savedir)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e88548fb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['AndhraPradesh_Anantpur',\n",
" 'AndhraPradesh_Chittoor',\n",
" 'AndhraPradesh_Guntur',\n",
" 'AndhraPradesh_Krishna',\n",
" 'AndhraPradesh_Srikakulam',\n",
" 'ArunachalPradesh_Longding',\n",
" 'ArunachalPradesh_PapumPare',\n",
" 'Assam_KamrupMetropolitan',\n",
" 'Assam_Sonitpur',\n",
" 'Bihar_Araria']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keys (\"<state>_<district>\") of the metadata files already present in savedir.\n",
"# Only entries ending with the suffix written by make_filter_df are considered, so\n",
"# unrelated entries (names without a dot, other file types) are skipped instead of\n",
"# raising IndexError or yielding bogus keys.\n",
"meta_suffix = \"_meta.parquet\"\n",
"done = [\n",
"    fname.removesuffix(meta_suffix)\n",
"    for fname in sorted(os.listdir(savedir))\n",
"    if fname.endswith(meta_suffix)\n",
"]\n",
"done"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}