{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "3b7de7a5", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n" ] } ], "source": [ "import os\n", "from huggingface_hub import login\n", "\n", "HF_TOKEN = open(\"/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token\", \"r\").read().strip()\n", "os.environ[\"HF_TOKEN\"] = HF_TOKEN\n", "login(token=HF_TOKEN)" ] }, { "cell_type": "code", "execution_count": 1, "id": "943e48ba", "metadata": {}, "outputs": [ { "data": { "text/html": [ "naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)\n", "

\n", "
Parquet SCAN [https://huggingface.co/datasets/ARTPARK-IISc%2FVaani/resolve/main/audio%2FAndhraPradesh%2FAnantpur%2Ftrain-00000-of-00057.parquet, ... 56 other sources] [id: 140121137377680]

PROJECT */11 COLUMNS
" ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import polars as pl\n", "\n", "df2 = pl.scan_parquet('hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Anantpur/train-*.parquet')\n", "df2" ] }, { "cell_type": "code", "execution_count": 2, "id": "190bd8f9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)\n", "

\n", "
simple π 11/11 [\"language\", \"languagesKnown\", ... 9 other columns]

WITH_COLUMNS:

[col(\"audio\").struct.field_by_name(path)().alias(\"audio_path\")]

Parquet SCAN [https://huggingface.co/datasets/ARTPARK-IISc%2FVaani/resolve/main/audio%2FAndhraPradesh%2FAnantpur%2Ftrain-00000-of-00057.parquet, ... 56 other sources] [id: 140121137377680]

PROJECT */11 COLUMNS
" ], "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df2.with_columns(\n", "\tpl.col('audio').struct.field('path').alias('audio_path')\n", ").drop('audio')\n", "df2" ] }, { "cell_type": "code", "execution_count": 3, "id": "1f881998", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_2159560/3936327964.py:1: PerformanceWarning: Determining the column names of a LazyFrame requires resolving its schema, which is a potentially expensive operation. Use `LazyFrame.collect_schema().names()` to get the column names without this warning.\n", " df2.columns\n" ] }, { "data": { "text/plain": [ "['language',\n", " 'languagesKnown',\n", " 'gender',\n", " 'state',\n", " 'district',\n", " 'pincode',\n", " 'stay(years)',\n", " 'isTranscriptionAvailable',\n", " 'transcript',\n", " 'referenceImage',\n", " 'audio_path']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.columns" ] }, { "cell_type": "code", "execution_count": 4, "id": "9b32a79a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (2, 1)
isTranscriptionAvailable
struct[2]
{"No",113132}
{"Yes",5420}
" ], "text/plain": [ "shape: (2, 1)\n", "┌──────────────────────────┐\n", "│ isTranscriptionAvailable │\n", "│ --- │\n", "│ struct[2] │\n", "╞══════════════════════════╡\n", "│ {\"No\",113132} │\n", "│ {\"Yes\",5420} │\n", "└──────────────────────────┘" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.select(pl.col('isTranscriptionAvailable').value_counts()).collect()" ] }, { "cell_type": "code", "execution_count": 6, "id": "035af4ce", "metadata": {}, "outputs": [], "source": [ "df2.collect().write_parquet(\"AndhraPradesh_Anantpur_meta.parquet\", compression=\"gzip\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d87a8f25", "metadata": {}, "outputs": [], "source": [ "import polars as pl\n", "\n", "df = pl.read_parquet('hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Anantpur/train-*.parquet')\n", "df" ] }, { "cell_type": "code", "execution_count": 7, "id": "e055da50", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n" ] }, { "data": { "text/html": [ "
Dask DataFrame Structure:
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
audiolanguagelanguagesKnowngenderstatedistrictpincodestay(years)isTranscriptionAvailabletranscriptreferenceImage
npartitions=54
objectstringstringstringstringstringint64stringstringstringstring
.................................
....................................
.................................
.................................
\n", "
Dask Name: read_parquet, 1 expression
" ], "text/plain": [ "Dask DataFrame Structure:\n", " audio language languagesKnown gender state district pincode stay(years) isTranscriptionAvailable transcript referenceImage\n", "npartitions=54 \n", " object string string string string string int64 string string string string\n", " ... ... ... ... ... ... ... ... ... ... ...\n", "... ... ... ... ... ... ... ... ... ... ... ...\n", " ... ... ... ... ... ... ... ... ... ... ...\n", " ... ... ... ... ... ... ... ... ... ... ...\n", "Dask Name: read_parquet, 1 expression\n", "Expr=ReadParquetFSSpec(446f636)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "from huggingface_hub import login\n", "\n", "HF_TOKEN = open(\"/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token\", \"r\").read().strip()\n", "os.environ[\"HF_TOKEN\"] = HF_TOKEN\n", "login(token=HF_TOKEN)\n", "\n", "import dask.dataframe as dd\n", "\n", "df = dd.read_parquet(\"hf://datasets/ARTPARK-IISc/Vaani/audio/Delhi/NewDelhi/train-*.parquet\")\n", "df " ] }, { "cell_type": "code", "execution_count": 9, "id": "d13cc82c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Dask DataFrame Structure:
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
languagelanguagesKnowngenderstatedistrictpincodestay(years)isTranscriptionAvailabletranscriptreferenceImageaudio_path
npartitions=54
stringstringstringstringstringint64stringstringstringstringobject
.................................
....................................
.................................
.................................
\n", "
Dask Name: drop_by_shallow_copy, 5 expressions
" ], "text/plain": [ "Dask DataFrame Structure:\n", " language languagesKnown gender state district pincode stay(years) isTranscriptionAvailable transcript referenceImage audio_path\n", "npartitions=54 \n", " string string string string string int64 string string string string object\n", " ... ... ... ... ... ... ... ... ... ... ...\n", "... ... ... ... ... ... ... ... ... ... ... ...\n", " ... ... ... ... ... ... ... ... ... ... ...\n", " ... ... ... ... ... ... ... ... ... ... ...\n", "Dask Name: drop_by_shallow_copy, 5 expressions\n", "Expr=Drop(frame=Assign(frame=ReadParquetFSSpec(446f636)), columns='audio')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['audio_path'] = df['audio'].map(lambda x: x.get('path') if isinstance(x, dict) else None, meta=('audio_path', 'object'))\n", "\n", "# Drop the original 'audio' column\n", "df = df.drop(columns='audio')\n", "df" ] }, { "cell_type": "code", "execution_count": 10, "id": "bb0dde46", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "'(ReadTimeoutError(\"HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)\"), '(Request ID: e0ac54e2-ea47-4a2a-80a4-c54485398fba)')' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00028-of-00054.parquet\n", "Retrying in 1s [Retry 1/5].\n", "'(ReadTimeoutError(\"HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)\"), '(Request ID: f7da590c-34ef-45f7-a824-ceca80b768ff)')' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00043-of-00054.parquet\n", "Retrying in 1s [Retry 1/5].\n", "'HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.' 
thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00003-of-00054.parquet\n", "Retrying in 1s [Retry 1/5].\n", "'HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00044-of-00054.parquet\n", "Retrying in 1s [Retry 1/5].\n", "'HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00043-of-00054.parquet\n", "Retrying in 2s [Retry 2/5].\n", "'HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00017-of-00054.parquet\n", "Retrying in 1s [Retry 1/5].\n", "'HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.' 
thrown while requesting GET https://huggingface.co/datasets/ARTPARK-IISc/Vaani/resolve/main/audio/Delhi/NewDelhi/train-00003-of-00054.parquet\n", "Retrying in 1s [Retry 1/5].\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 3\u001b[39m district = \u001b[33m'\u001b[39m\u001b[33mNewDelhi\u001b[39m\u001b[33m'\u001b[39m\n\u001b[32m 4\u001b[39m output_path = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msavedir\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstate\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdistrict\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m_meta.parquet\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mgzip\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mpyarrow\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwrite_index\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/site-packages/dask/dataframe/dask_expr/_collection.py:3314\u001b[39m, in \u001b[36mDataFrame.to_parquet\u001b[39m\u001b[34m(self, path, **kwargs)\u001b[39m\n\u001b[32m 3311\u001b[39m 
\u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mto_parquet\u001b[39m(\u001b[38;5;28mself\u001b[39m, path, **kwargs):\n\u001b[32m 3312\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdask\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdataframe\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdask_expr\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mio\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mparquet\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m to_parquet\n\u001b[32m-> \u001b[39m\u001b[32m3314\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mto_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/site-packages/dask/dataframe/dask_expr/io/parquet.py:661\u001b[39m, in \u001b[36mto_parquet\u001b[39m\u001b[34m(df, path, compression, write_index, append, overwrite, ignore_divisions, partition_on, storage_options, custom_metadata, write_metadata_file, compute, compute_kwargs, schema, name_function, filesystem, engine, **kwargs)\u001b[39m\n\u001b[32m 637\u001b[39m out = new_collection(\n\u001b[32m 638\u001b[39m ToParquet(\n\u001b[32m 639\u001b[39m df,\n\u001b[32m (...)\u001b[39m\u001b[32m 657\u001b[39m )\n\u001b[32m 658\u001b[39m )\n\u001b[32m 660\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m compute:\n\u001b[32m--> \u001b[39m\u001b[32m661\u001b[39m out = \u001b[43mout\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mcompute_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 663\u001b[39m \u001b[38;5;66;03m# Invalidate the filesystem listing cache for the output path after 
write.\u001b[39;00m\n\u001b[32m 664\u001b[39m \u001b[38;5;66;03m# We do this before returning, even if `compute=False`. This helps ensure\u001b[39;00m\n\u001b[32m 665\u001b[39m \u001b[38;5;66;03m# that reading files that were just written succeeds.\u001b[39;00m\n\u001b[32m 666\u001b[39m fs.invalidate_cache(path)\n", "\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/site-packages/dask/base.py:373\u001b[39m, in \u001b[36mDaskMethodsMixin.compute\u001b[39m\u001b[34m(self, **kwargs)\u001b[39m\n\u001b[32m 349\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcompute\u001b[39m(\u001b[38;5;28mself\u001b[39m, **kwargs):\n\u001b[32m 350\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Compute this dask collection\u001b[39;00m\n\u001b[32m 351\u001b[39m \n\u001b[32m 352\u001b[39m \u001b[33;03m This turns a lazy Dask collection into its in-memory equivalent.\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 371\u001b[39m \u001b[33;03m dask.compute\u001b[39;00m\n\u001b[32m 372\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m373\u001b[39m (result,) = \u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraverse\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 374\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", "\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/site-packages/dask/base.py:681\u001b[39m, in \u001b[36mcompute\u001b[39m\u001b[34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001b[39m\n\u001b[32m 678\u001b[39m expr = expr.optimize()\n\u001b[32m 679\u001b[39m keys = \u001b[38;5;28mlist\u001b[39m(flatten(expr.__dask_keys__()))\n\u001b[32m--> \u001b[39m\u001b[32m681\u001b[39m results = 
\u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 683\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m repack(results)\n", "\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/queue.py:202\u001b[39m, in \u001b[36mQueue.get\u001b[39m\u001b[34m(self, block, timeout)\u001b[39m\n\u001b[32m 200\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 201\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._qsize():\n\u001b[32m--> \u001b[39m\u001b[32m202\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnot_empty\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 203\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.is_shutdown \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._qsize():\n\u001b[32m 204\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ShutDown\n", "\u001b[36mFile \u001b[39m\u001b[32m~/.conda/envs/aku/lib/python3.13/threading.py:359\u001b[39m, in \u001b[36mCondition.wait\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m 357\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[32m 358\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m359\u001b[39m \u001b[43mwaiter\u001b[49m\u001b[43m.\u001b[49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 360\u001b[39m gotit = 
\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[32m 361\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", "\u001b[31mKeyboardInterrupt\u001b[39m: " ] } ], "source": [ "savedir = \"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\"\n", "state = 'Delhi'\n", "district = 'NewDelhi'\n", "output_path = f\"{savedir}/{state}_{district}_meta.parquet\"\n", "df.to_parquet(output_path, compression='gzip', engine='pyarrow', write_index=False)" ] }, { "cell_type": "code", "execution_count": 31, "id": "980fc707", "metadata": {}, "outputs": [], "source": [ "df = df.with_columns(df['audio'].struct.field('path').alias('audio_path')).drop('audio')" ] }, { "cell_type": "code", "execution_count": 32, "id": "5fb10cde", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (118_552, 11)
languagelanguagesKnowngenderstatedistrictpincodestay(years)isTranscriptionAvailabletranscriptreferenceImageaudio_path
strstrstrstrstri64strstrstrstrstr
"Telugu""['Telugu', 'English']""Female""AndhraPradesh""Anantpur"515671"NA(18)""No"null"Images/IISc_VaaniProject_Anant…"IISc_VaaniProject_M_AP_Anantpu…
"Telugu""['Telugu', 'English']""Female""AndhraPradesh""Anantpur"515671"NA(18)""No"null"Images/IISc_VaaniProject_GENER…"IISc_VaaniProject_M_AP_Anantpu…
"Telugu""['Telugu', 'English']""Female""AndhraPradesh""Anantpur"515671"NA(18)""No"null"Images/IISc_VaaniProject_GENER…"IISc_VaaniProject_M_AP_Anantpu…
"Telugu""['Telugu', 'English']""Female""AndhraPradesh""Anantpur"515671"NA(18)""No"null"Images/IISc_VaaniProject_GENER…"IISc_VaaniProject_M_AP_Anantpu…
"Telugu""['Telugu', 'English']""Female""AndhraPradesh""Anantpur"515671"NA(18)""No"null"Images/IISc_VaaniProject_GENER…"IISc_VaaniProject_M_AP_Anantpu…
"Hindi""['English', 'Hindi']""Female""AndhraPradesh""Anantpur"515414"Anantpur(36)""No"null"Images/IISc_VaaniProject_Anant…"IISc_VaaniProject_S_AP_Anantpu…
"Hindi""['English', 'Hindi']""Female""AndhraPradesh""Anantpur"515414"Anantpur(36)""No"null"Images/IISc_VaaniProject_Anant…"IISc_VaaniProject_S_AP_Anantpu…
"Hindi""['English', 'Hindi']""Female""AndhraPradesh""Anantpur"515414"Anantpur(36)""No"null"Images/IISc_VaaniProject_Anant…"IISc_VaaniProject_S_AP_Anantpu…
"Hindi""['English', 'Hindi']""Female""AndhraPradesh""Anantpur"515414"Anantpur(36)""No"null"Images/IISc_VaaniProject_Anant…"IISc_VaaniProject_S_AP_Anantpu…
"Hindi""['English', 'Hindi']""Female""AndhraPradesh""Anantpur"515414"Anantpur(36)""No"null"Images/IISc_VaaniProject_Anant…"IISc_VaaniProject_S_AP_Anantpu…
" ], "text/plain": [ "shape: (118_552, 11)\n", "┌──────────┬────────────┬────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐\n", "│ language ┆ languagesK ┆ gender ┆ state ┆ … ┆ isTranscri ┆ transcrip ┆ reference ┆ audio_pat │\n", "│ --- ┆ nown ┆ --- ┆ --- ┆ ┆ ptionAvail ┆ t ┆ Image ┆ h │\n", "│ str ┆ --- ┆ str ┆ str ┆ ┆ able ┆ --- ┆ --- ┆ --- │\n", "│ ┆ str ┆ ┆ ┆ ┆ --- ┆ str ┆ str ┆ str │\n", "│ ┆ ┆ ┆ ┆ ┆ str ┆ ┆ ┆ │\n", "╞══════════╪════════════╪════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡\n", "│ Telugu ┆ ['Telugu', ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n", "│ ┆ 'English'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ M_AP_Anan │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n", "│ Telugu ┆ ['Telugu', ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n", "│ ┆ 'English'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE ┆ M_AP_Anan │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… ┆ tpu… │\n", "│ Telugu ┆ ['Telugu', ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n", "│ ┆ 'English'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE ┆ M_AP_Anan │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… ┆ tpu… │\n", "│ Telugu ┆ ['Telugu', ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n", "│ ┆ 'English'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE ┆ M_AP_Anan │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… ┆ tpu… │\n", "│ Telugu ┆ ['Telugu', ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n", "│ ┆ 'English'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE ┆ M_AP_Anan │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… ┆ tpu… │\n", "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", "│ Hindi ┆ ['English' ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n", "│ ┆ , 'Hindi'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ S_AP_Anan │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n", "│ Hindi ┆ 
['English' ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n", "│ ┆ , 'Hindi'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ S_AP_Anan │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n", "│ Hindi ┆ ['English' ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n", "│ ┆ , 'Hindi'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ S_AP_Anan │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n", "│ Hindi ┆ ['English' ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n", "│ ┆ , 'Hindi'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ S_AP_Anan │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n", "│ Hindi ┆ ['English' ┆ Female ┆ AndhraPrad ┆ … ┆ No ┆ null ┆ Images/II ┆ IISc_Vaan │\n", "│ ┆ , 'Hindi'] ┆ ┆ esh ┆ ┆ ┆ ┆ Sc_VaaniP ┆ iProject_ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_An ┆ S_AP_Anan │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ant… ┆ tpu… │\n", "└──────────┴────────────┴────────┴────────────┴───┴────────────┴───────────┴───────────┴───────────┘" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 38, "id": "a1f731e6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (2, 2)
isTranscriptionAvailablecount
stru32
"Yes"5420
"No"113132
" ], "text/plain": [ "shape: (2, 2)\n", "┌──────────────────────────┬────────┐\n", "│ isTranscriptionAvailable ┆ count │\n", "│ --- ┆ --- │\n", "│ str ┆ u32 │\n", "╞══════════════════════════╪════════╡\n", "│ Yes ┆ 5420 │\n", "│ No ┆ 113132 │\n", "└──────────────────────────┴────────┘" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['isTranscriptionAvailable'].value_counts()" ] }, { "cell_type": "code", "execution_count": 27, "id": "eef6bd00", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'IISc_VaaniProject_M_AP_Anantpur_00014520_1544240000_APATSR_190315_1880_16300.wav'" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['audio'][0]['path']" ] }, { "cell_type": "markdown", "id": "e9130188", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "id": "ba3e08c6", "metadata": {}, "outputs": [], "source": [ "import datasets\n", "from datasets import Audio\n", "from datasets import load_dataset\n", "from datasets import get_dataset_config_names\n", "import os\n", "from huggingface_hub import login\n", "HF_TOKEN = open(\"/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token\", \"r\").read().strip()\n", "login(token=HF_TOKEN)" ] }, { "cell_type": "code", "execution_count": 2, "id": "52b42738", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ad907faba3574297b89f5d659cf8726f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Resolving data files: 0%| | 0/57 [00:00\n", "shape: (164_125, 11)
audio_pathlanguagelanguagesKnowngenderstatedistrictpincodestay(years)isTranscriptionAvailabletranscriptreferenceImage
strstrstrstrstrstri64strstrstrstr
"IISc_VaaniProject_M_Delhi_NewD…"Hindi""['Hindi']""Female""Delhi""NewDelhi"110004"NewDelhi(20)""No"null"Images/IISc_VaaniProject_GENER…
"IISc_VaaniProject_M_Delhi_NewD…"Hindi""['Hindi']""Female""Delhi""NewDelhi"110001"NewDelhi(19)""No"null"Images/IISc_VaaniProject_GENER…
"IISc_VaaniProject_M_Delhi_NewD…"Hindi""['Hindi']""Female""Delhi""NewDelhi"110067"NewDelhi(11)""No"null"Images/IISc_VaaniProject_NewDe…
"IISc_VaaniProject_M_Delhi_NewD…"Hindi""['Hindi']""Male""Delhi""NewDelhi"110001"NewDelhi(24)""Yes""ऐच_डी_ऐफ_सी बैंक {H_D_F_C bank…"Images/IISc_VaaniProject_GENER…
"IISc_VaaniProject_M_Delhi_NewD…"Hindi""['Hindi']""Male""Delhi""NewDelhi"110023"NewDelhi(20)""No"null"Images/IISc_VaaniProject_GENER…
"IISc_VaaniProject_M_Delhi_NewD…"Hindi""['Hindi']""Male""Delhi""NewDelhi"110046"NewDelhi(45)""No"null"Images/IISc_VaaniProject_NewDe…
"IISc_VaaniProject_M_Delhi_NewD…"Hindi""['Hindi']""Female""Delhi""NewDelhi"110021"CentralDelhi(31)""No"null"Images/IISc_VaaniProject_NewDe…
"IISc_VaaniProject_M_Delhi_NewD…"Hindi""['Hindi']""Female""Delhi""NewDelhi"110004"NewDelhi(20)""No"null"Images/IISc_VaaniProject_NewDe…
"IISc_VaaniProject_M_Delhi_NewD…"Hindi""['Hindi']""Male""Delhi""NewDelhi"110028"NewDelhi(26)""No"null"Images/IISc_VaaniProject_NewDe…
"IISc_VaaniProject_M_Delhi_NewD…"Hindi""['English', 'Hindi']""Male""Delhi""NewDelhi"110070"NewDelhi(41)""No"null"Images/IISc_VaaniProject_NewDe…
" ], "text/plain": [ "shape: (164_125, 11)\n", "┌────────────┬──────────┬────────────┬────────┬───┬────────────┬───────────┬───────────┬───────────┐\n", "│ audio_path ┆ language ┆ languagesK ┆ gender ┆ … ┆ stay(years ┆ isTranscr ┆ transcrip ┆ reference │\n", "│ --- ┆ --- ┆ nown ┆ --- ┆ ┆ ) ┆ iptionAva ┆ t ┆ Image │\n", "│ str ┆ str ┆ --- ┆ str ┆ ┆ --- ┆ ilable ┆ --- ┆ --- │\n", "│ ┆ ┆ str ┆ ┆ ┆ str ┆ --- ┆ str ┆ str │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ ┆ │\n", "╞════════════╪══════════╪════════════╪════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡\n", "│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Female ┆ … ┆ NewDelhi(2 ┆ No ┆ null ┆ Images/II │\n", "│ Project_M_ ┆ ┆ ┆ ┆ ┆ 0) ┆ ┆ ┆ Sc_VaaniP │\n", "│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE │\n", "│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… │\n", "│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Female ┆ … ┆ NewDelhi(1 ┆ No ┆ null ┆ Images/II │\n", "│ Project_M_ ┆ ┆ ┆ ┆ ┆ 9) ┆ ┆ ┆ Sc_VaaniP │\n", "│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE │\n", "│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… │\n", "│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Female ┆ … ┆ NewDelhi(1 ┆ No ┆ null ┆ Images/II │\n", "│ Project_M_ ┆ ┆ ┆ ┆ ┆ 1) ┆ ┆ ┆ Sc_VaaniP │\n", "│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n", "│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n", "│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Male ┆ … ┆ NewDelhi(2 ┆ Yes ┆ ऐच_डी_ऐफ_ ┆ Images/II │\n", "│ Project_M_ ┆ ┆ ┆ ┆ ┆ 4) ┆ ┆ सी बैंक ┆ Sc_VaaniP │\n", "│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ {H_D_F_C ┆ roject_GE │\n", "│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ bank… ┆ NER… │\n", "│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Male ┆ … ┆ NewDelhi(2 ┆ No ┆ null ┆ Images/II │\n", "│ Project_M_ ┆ ┆ ┆ ┆ ┆ 0) ┆ ┆ ┆ Sc_VaaniP │\n", "│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_GE │\n", "│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ NER… │\n", "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", "│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Male ┆ … ┆ NewDelhi(4 ┆ No ┆ null ┆ Images/II │\n", "│ Project_M_ ┆ ┆ ┆ ┆ ┆ 5) ┆ ┆ ┆ Sc_VaaniP │\n", "│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n", "│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n", "│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Female ┆ … ┆ CentralDel ┆ No ┆ 
null ┆ Images/II │\n", "│ Project_M_ ┆ ┆ ┆ ┆ ┆ hi(31) ┆ ┆ ┆ Sc_VaaniP │\n", "│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n", "│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n", "│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Female ┆ … ┆ NewDelhi(2 ┆ No ┆ null ┆ Images/II │\n", "│ Project_M_ ┆ ┆ ┆ ┆ ┆ 0) ┆ ┆ ┆ Sc_VaaniP │\n", "│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n", "│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n", "│ IISc_Vaani ┆ Hindi ┆ ['Hindi'] ┆ Male ┆ … ┆ NewDelhi(2 ┆ No ┆ null ┆ Images/II │\n", "│ Project_M_ ┆ ┆ ┆ ┆ ┆ 6) ┆ ┆ ┆ Sc_VaaniP │\n", "│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n", "│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n", "│ IISc_Vaani ┆ Hindi ┆ ['English' ┆ Male ┆ … ┆ NewDelhi(4 ┆ No ┆ null ┆ Images/II │\n", "│ Project_M_ ┆ ┆ , 'Hindi'] ┆ ┆ ┆ 1) ┆ ┆ ┆ Sc_VaaniP │\n", "│ Delhi_NewD ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ roject_Ne │\n", "│ … ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ wDe… │\n", "└────────────┴──────────┴────────────┴────────┴───┴────────────┴───────────┴───────────┴───────────┘" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 12, "id": "160b98ab", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/Delhi_NewDelhi'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cache_dir" ] }, { "cell_type": "code", "execution_count": 6, "id": "e781d016", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Files found: 54\n" ] } ], "source": [
    "def walkDIR(folder_path, include=None):\n",
    "    \"\"\"Recursively collect the paths of files under `folder_path`.\n",
    "\n",
    "    Args:\n",
    "        folder_path: Root directory passed to `os.walk`.\n",
    "        include: Optional suffix filter. `None` keeps every file; a single\n",
    "            string or an iterable of strings keeps only file names ending\n",
    "            with one of them.\n",
    "\n",
    "    Returns:\n",
    "        List of `os.path.join(root, file)` paths in `os.walk` order. The\n",
    "        number of matches is also printed.\n",
    "    \"\"\"\n",
    "    if include is None:\n",
    "        suffixes = None\n",
    "    elif isinstance(include, str):\n",
    "        # A bare string would otherwise be iterated character by character.\n",
    "        suffixes = (include,)\n",
    "    else:\n",
    "        suffixes = tuple(include)\n",
    "\n",
    "    file_list = []\n",
    "    for root, _, files in os.walk(folder_path):\n",
    "        for file in files:\n",
    "            if suffixes is None or file.endswith(suffixes):\n",
    "                file_list.append(os.path.join(root, file))\n",
    "    print(\"Files found:\", len(file_list))\n",
    "    return file_list\n",
    "\n",
    "shards = walkDIR(cache_dir, include=['.arrow'])"
 ] }, { "cell_type": "code", "execution_count": 18, "id": "86c09579", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/Delhi_NewDelhi/ARTPARK-IISc___vaani/Delhi_NewDelhi/0.0.0/bebdc89de129c988e87623d031860f3abdb77ac5/vaani-train-00011-of-00054.arrow'" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shards[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "4a431824", "metadata": {}, "outputs": [], "source": [ "'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/Delhi_NewDelhi/ARTPARK-IISc___vaani/Delhi_NewDelhi/0.0.0'" ] }, { "cell_type": "code", "execution_count": 11, "id": "11dd7e8f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audio-Cache/Hindi/Delhi_NewDelhi/ARTPARK-IISc___vaani/Delhi_NewDelhi/0.0.0/bebdc89de129c988e87623d031860f3abdb77ac5/vaani-train-00011-of-00054.arrow'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shards[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "c2aae2b9", "metadata": {}, "outputs": [], "source": [
    "from datasets import load_dataset, Audio\n",
    "import polars as pl\n",
    "import os\n",
    "from tqdm import tqdm\n",
    "\n",
    "def make_filter_df_from_hf(state, district, cache_dir, savedir, hf_token=None, num_proc=20):\n",
    "    \"\"\"Export the non-audio metadata of one Vaani config to a Parquet file.\n",
    "\n",
    "    Loads the `train` split with `load_dataset`, keeps only the stored path\n",
    "    of every audio entry (audio is not decoded) and writes it together with\n",
    "    all other columns to `<savedir>/<state>_<district>_meta.parquet`.\n",
    "\n",
    "    Args:\n",
    "        state: Used only for the progress label and the output file name.\n",
    "        district: Passed to `load_dataset` as the config name and also used\n",
    "            in the output file name.\n",
    "            NOTE(review): the config names listed in this notebook look like\n",
    "            `State_District` - confirm what callers pass here.\n",
    "        cache_dir: Cache directory forwarded to `load_dataset`.\n",
    "        savedir: Output directory; created when missing.\n",
    "        hf_token: Optional token forwarded to `load_dataset`.\n",
    "        num_proc: Value forwarded to `load_dataset(num_proc=...)`; the default\n",
    "            of 20 is the value that was previously hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        Path of the Parquet file that was written.\n",
    "    \"\"\"\n",
    "    # Load dataset\n",
    "    ds = load_dataset(\n",
    "        \"ARTPARK-IISc/Vaani\",\n",
    "        district,\n",
    "        split=\"train\",\n",
    "        num_proc=num_proc,\n",
    "        token=hf_token,\n",
    "        cache_dir=cache_dir,\n",
    "        streaming=False\n",
    "    )\n",
    "\n",
    "    # Cast audio column to keep path only (no decoding)\n",
    "    ds = ds.cast_column(\"audio\", Audio(decode=False))\n",
    "\n",
    "    # One list per non-audio column, filled row by row\n",
    "    audio_paths = []\n",
    "    columns = {col: [] for col in ds.column_names if col != \"audio\"}\n",
    "\n",
    "    # Iterate with tqdm\n",
    "    for row in tqdm(ds, desc=f\"Processing {state}_{district}\"):\n",
    "        audio_paths.append(row[\"audio\"][\"path\"])\n",
    "        for col, values in columns.items():\n",
    "            values.append(row[col])\n",
    "\n",
    "    # Combine into Polars DataFrame\n",
    "    df = pl.DataFrame({\n",
    "        \"audio_path\": audio_paths,\n",
    "        **columns\n",
    "    })\n",
    "\n",
    "    # Save as Parquet\n",
    "    os.makedirs(savedir, exist_ok=True)\n",
    "    out_path = os.path.join(savedir, f\"{state}_{district}_meta.parquet\")\n",
    "    df.write_parquet(out_path, compression=\"gzip\")\n",
    "\n",
    "    print(f\"✅ Saved {state}_{district}_meta.parquet to {out_path}\")\n",
    "    return out_path\n"
 ] }, { "cell_type": "markdown", "id": "ccb2043e", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "id": "6c24f6db", "metadata": {}, "outputs": [], "source": [ "import os\n", "from huggingface_hub import login\n", "\n", "# HF_TOKEN = open(\"/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/token\", \"r\").read().strip()\n", "# os.environ[\"HF_TOKEN\"] = HF_TOKEN\n", "# login(token=HF_TOKEN)" ] }, { "cell_type": "code", "execution_count": 2, "id": "61363e0b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', 'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare', 'Assam_KamrupMetropolitan', 'Assam_Sonitpur', 'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan', 'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar', 'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna', 'Bihar_Purnia', 'Bihar_Saharsa', 'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan', 'Chandigarh_Chandigarh', 'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur', 'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon', 'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma', 'Delhi_NewDelhi', 'Goa_NorthSouthGoa', 'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu', 'Jharkhand_Ranchi', 'Jharkhand_Sahebganj', 'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur', 'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga', 'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni', 
'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Nagpur', 'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur', 'Meghalaya_WestGaroHills', 'Nagaland_Dimapur', 'Nagaland_Kohima', 'Odisha_Khordha', 'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris', 'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda', 'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura', 'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah', 'UttarPradesh_Ghazipur', 'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur', 'UttarPradesh_Jalaun', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur', 'UttarPradesh_Lucknow', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'UttarPradesh_Varanasi', 'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi', 'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur', 'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 'WestBengal_Kolkata', 'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia', 'images'] \n", "\n", "\n" ] } ], "source": [ "# from datasets import get_dataset_config_names\n", "# configs = get_dataset_config_names(\"ARTPARK-IISc/Vaani\")\n", "\n", "configs = ['AndhraPradesh_Anantpur', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', 'ArunachalPradesh_Longding', 'ArunachalPradesh_PapumPare', 'Assam_KamrupMetropolitan', 'Assam_Sonitpur', 'Bihar_Araria', 'Bihar_Begusarai', 'Bihar_Bhagalpur', 'Bihar_Darbhanga', 'Bihar_EastChamparan', 'Bihar_Gaya', 'Bihar_Gopalganj', 'Bihar_Jahanabad', 'Bihar_Jamui', 'Bihar_Kaimur', 'Bihar_Katihar', 'Bihar_Kishanganj', 'Bihar_Lakhisarai', 'Bihar_Madhepura', 'Bihar_Muzaffarpur', 'Bihar_Patna', 'Bihar_Purnia', 'Bihar_Saharsa', 
'Bihar_Samastipur', 'Bihar_Saran', 'Bihar_Sitamarhi', 'Bihar_Supaul', 'Bihar_Vaishali', 'Bihar_WestChamparan', 'Chandigarh_Chandigarh', 'Chhattisgarh_Balrampur', 'Chhattisgarh_Bastar', 'Chhattisgarh_Bilaspur', 'Chhattisgarh_Jashpur', 'Chhattisgarh_Kabirdham', 'Chhattisgarh_Korba', 'Chhattisgarh_Raigarh', 'Chhattisgarh_Rajnandgaon', 'Chhattisgarh_Sarguja', 'Chhattisgarh_Sukma', 'Delhi_NewDelhi', 'Goa_NorthSouthGoa', 'Jharkhand_Deoghar', 'Jharkhand_Garhwa', 'Jharkhand_Jamtara', 'Jharkhand_Palamu', 'Jharkhand_Ranchi', 'Jharkhand_Sahebganj', 'Karnataka_Bangalore', 'Karnataka_Belgaum', 'Karnataka_Bellary', 'Karnataka_Bidar', 'Karnataka_Bijapur', 'Karnataka_Chamrajnagar', 'Karnataka_DakshinKannada', 'Karnataka_Dharwad', 'Karnataka_Gulbarga', 'Karnataka_Koppal', 'Karnataka_Mysore', 'Karnataka_Raichur', 'Karnataka_Shimoga', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Dhar', 'MadhyaPradesh_Katni', 'Maharashtra_Aurangabad', 'Maharashtra_Chandrapur', 'Maharashtra_Dhule', 'Maharashtra_Nagpur', 'Maharashtra_Pune', 'Maharashtra_Sindhudurga', 'Maharashtra_Solapur', 'Meghalaya_WestGaroHills', 'Nagaland_Dimapur', 'Nagaland_Kohima', 'Odisha_Khordha', 'Rajasthan_Churu', 'Rajasthan_Jaipur', 'Rajasthan_Nagaur', 'TamilNadu_Chennai', 'TamilNadu_Kanyakumari', 'TamilNadu_Namakkal', 'TamilNadu_Nilgiris', 'Telangana_Hyderabad', 'Telangana_Karimnagar', 'Telangana_Mahabubabad', 'Telangana_Nalgonda', 'Tripura_Dhalai', 'Tripura_Unakoti', 'Tripura_WestTripura', 'UttarPradesh_Budaun', 'UttarPradesh_Deoria', 'UttarPradesh_Etah', 'UttarPradesh_Ghazipur', 'UttarPradesh_Gorakhpur', 'UttarPradesh_Hamirpur', 'UttarPradesh_Jalaun', 'UttarPradesh_JyotibaPhuleNagar', 'UttarPradesh_Lalitpur', 'UttarPradesh_Lucknow', 'UttarPradesh_Muzzaffarnagar', 'UttarPradesh_Saharanpur', 'UttarPradesh_Varanasi', 'Uttarakhand_TehriGarhwal', 'Uttarakhand_Uttarkashi', 'WestBengal_Alipurduar', 'WestBengal_CoochBehar', 'WestBengal_DakshinDinajpur', 'WestBengal_Darjeeling', 'WestBengal_Jalpaiguri', 'WestBengal_Jhargram', 
'WestBengal_Kolkata', 'WestBengal_Malda', 'WestBengal_North24Parganas', 'WestBengal_PaschimMedinipur', 'WestBengal_Purulia', 'images']\n", "print(configs, \"\\n\\n\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "677169ff", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statedistricturlconfig_name
0AndhraPradeshAnantpurhf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP...AndhraPradesh_Anantpur
1AndhraPradeshChittoorhf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP...AndhraPradesh_Chittoor
2AndhraPradeshGunturhf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP...AndhraPradesh_Guntur
3AndhraPradeshKrishnahf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP...AndhraPradesh_Krishna
4AndhraPradeshSrikakulamhf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP...AndhraPradesh_Srikakulam
...............
116WestBengalMaldahf://datasets/ARTPARK-IISc/Vaani/audio/WestBen...WestBengal_Malda
117WestBengalNorth24Parganashf://datasets/ARTPARK-IISc/Vaani/audio/WestBen...WestBengal_North24Parganas
118WestBengalPaschimMedinipurhf://datasets/ARTPARK-IISc/Vaani/audio/WestBen...WestBengal_PaschimMedinipur
119WestBengalPuruliahf://datasets/ARTPARK-IISc/Vaani/audio/WestBen...WestBengal_Purulia
120imagesimageshf://datasets/ARTPARK-IISc/Vaani/audio/images/...images
\n", "

121 rows × 4 columns

\n", "
" ], "text/plain": [ " state district \\\n", "0 AndhraPradesh Anantpur \n", "1 AndhraPradesh Chittoor \n", "2 AndhraPradesh Guntur \n", "3 AndhraPradesh Krishna \n", "4 AndhraPradesh Srikakulam \n", ".. ... ... \n", "116 WestBengal Malda \n", "117 WestBengal North24Parganas \n", "118 WestBengal PaschimMedinipur \n", "119 WestBengal Purulia \n", "120 images images \n", "\n", " url \\\n", "0 hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... \n", "1 hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... \n", "2 hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... \n", "3 hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... \n", "4 hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraP... \n", ".. ... \n", "116 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n", "117 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n", "118 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n", "119 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n", "120 hf://datasets/ARTPARK-IISc/Vaani/audio/images/... \n", "\n", " config_name \n", "0 AndhraPradesh_Anantpur \n", "1 AndhraPradesh_Chittoor \n", "2 AndhraPradesh_Guntur \n", "3 AndhraPradesh_Krishna \n", "4 AndhraPradesh_Srikakulam \n", ".. ... 
\n", "116 WestBengal_Malda \n", "117 WestBengal_North24Parganas \n", "118 WestBengal_PaschimMedinipur \n", "119 WestBengal_Purulia \n", "120 images \n", "\n", "[121 rows x 4 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "import pandas as pd\n",
    "\n",
    "# One row per config name: the text before the first '_' is the state and\n",
    "# the text after the last '_' is the district.\n",
    "# NOTE(review): 'images' contains no '_', so state and district are both\n",
    "# 'images' for that entry - confirm it belongs in this table.\n",
    "urls_dict = {key: [] for key in ('state', 'district', 'url', 'config_name')}\n",
    "\n",
    "for config_name in configs:\n",
    "    state = config_name.partition('_')[0]\n",
    "    district = config_name.rpartition('_')[-1]\n",
    "    values = (\n",
    "        state,\n",
    "        district,\n",
    "        f\"hf://datasets/ARTPARK-IISc/Vaani/audio/{state}/{district}/train-*.parquet\",\n",
    "        config_name,\n",
    "    )\n",
    "    # urls_dict keys and `values` are in the same order.\n",
    "    for column, value in zip(urls_dict, values):\n",
    "        urls_dict[column].append(value)\n",
    "\n",
    "urls_df = pd.DataFrame(urls_dict)\n",
    "urls_df"
 ] }, { "cell_type": "code", "execution_count": 5, "id": "9b8cbec4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statedistricturlconfig_name
11BiharBegusaraihf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/B...Bihar_Begusarai
12BiharBhagalpurhf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/B...Bihar_Bhagalpur
13BiharDarbhangahf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/D...Bihar_Darbhanga
14BiharEastChamparanhf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/E...Bihar_EastChamparan
15BiharGayahf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/G...Bihar_Gaya
...............
116WestBengalMaldahf://datasets/ARTPARK-IISc/Vaani/audio/WestBen...WestBengal_Malda
117WestBengalNorth24Parganashf://datasets/ARTPARK-IISc/Vaani/audio/WestBen...WestBengal_North24Parganas
118WestBengalPaschimMedinipurhf://datasets/ARTPARK-IISc/Vaani/audio/WestBen...WestBengal_PaschimMedinipur
119WestBengalPuruliahf://datasets/ARTPARK-IISc/Vaani/audio/WestBen...WestBengal_Purulia
120imagesimageshf://datasets/ARTPARK-IISc/Vaani/audio/images/...images
\n", "

110 rows × 4 columns

\n", "
" ], "text/plain": [ " state district \\\n", "11 Bihar Begusarai \n", "12 Bihar Bhagalpur \n", "13 Bihar Darbhanga \n", "14 Bihar EastChamparan \n", "15 Bihar Gaya \n", ".. ... ... \n", "116 WestBengal Malda \n", "117 WestBengal North24Parganas \n", "118 WestBengal PaschimMedinipur \n", "119 WestBengal Purulia \n", "120 images images \n", "\n", " url \\\n", "11 hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/B... \n", "12 hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/B... \n", "13 hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/D... \n", "14 hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/E... \n", "15 hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/G... \n", ".. ... \n", "116 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n", "117 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n", "118 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n", "119 hf://datasets/ARTPARK-IISc/Vaani/audio/WestBen... \n", "120 hf://datasets/ARTPARK-IISc/Vaani/audio/images/... \n", "\n", " config_name \n", "11 Bihar_Begusarai \n", "12 Bihar_Bhagalpur \n", "13 Bihar_Darbhanga \n", "14 Bihar_EastChamparan \n", "15 Bihar_Gaya \n", ".. ... 
\n", "116 WestBengal_Malda \n", "117 WestBengal_North24Parganas \n", "118 WestBengal_PaschimMedinipur \n", "119 WestBengal_Purulia \n", "120 images \n", "\n", "[110 rows x 4 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "urls_df.iloc[11:,:]" ] }, { "cell_type": "code", "execution_count": 6, "id": "85eef080", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "144K\t/home/IITB/ai-at-ieor/23m1521/.cache/huggingface/hub/.locks/datasets--ARTPARK-IISc--Vaani\n" ] } ], "source": [ "!du -sh /home/IITB/ai-at-ieor/23m1521/.cache/huggingface/hub/.locks/datasets--ARTPARK-IISc--Vaani" ] }, { "cell_type": "code", "execution_count": 6, "id": "1966baa0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|\u001b[33m████████████████████████████\u001b[0m| 121/121 [00:00<00:00, 21100.56it/s]\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "AndhraPradesh Anantpur hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Anantpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "AndhraPradesh Chittoor hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Chittoor/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "AndhraPradesh Guntur hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Guntur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "AndhraPradesh Krishna hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Krishna/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "AndhraPradesh Srikakulam hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Srikakulam/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "AndhraPradesh Vishakapattanam hf://datasets/ARTPARK-IISc/Vaani/audio/AndhraPradesh/Vishakapattanam/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "ArunachalPradesh Longding 
hf://datasets/ARTPARK-IISc/Vaani/audio/ArunachalPradesh/Longding/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "ArunachalPradesh PapumPare hf://datasets/ARTPARK-IISc/Vaani/audio/ArunachalPradesh/PapumPare/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Assam KamrupMetropolitan hf://datasets/ARTPARK-IISc/Vaani/audio/Assam/KamrupMetropolitan/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Assam Sonitpur hf://datasets/ARTPARK-IISc/Vaani/audio/Assam/Sonitpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Araria hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Araria/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Begusarai hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Begusarai/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Bhagalpur hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Bhagalpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Darbhanga hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Darbhanga/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar EastChamparan hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/EastChamparan/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Gaya hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Gaya/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Gopalganj hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Gopalganj/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Jahanabad hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Jahanabad/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Jamui hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Jamui/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Kaimur 
hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Kaimur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Katihar hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Katihar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Kishanganj hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Kishanganj/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Lakhisarai hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Lakhisarai/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Madhepura hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Madhepura/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Muzaffarpur hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Muzaffarpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Patna hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Patna/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Purnia hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Purnia/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Saharsa hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Saharsa/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Samastipur hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Samastipur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Saran hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Saran/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Sitamarhi hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Sitamarhi/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Supaul hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Supaul/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar Vaishali hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/Vaishali/train-*.parquet 
/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Bihar WestChamparan hf://datasets/ARTPARK-IISc/Vaani/audio/Bihar/WestChamparan/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chandigarh Chandigarh hf://datasets/ARTPARK-IISc/Vaani/audio/Chandigarh/Chandigarh/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chhattisgarh Balrampur hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Balrampur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chhattisgarh Bastar hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Bastar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chhattisgarh Bilaspur hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Bilaspur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chhattisgarh Jashpur hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Jashpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chhattisgarh Kabirdham hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Kabirdham/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chhattisgarh Korba hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Korba/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chhattisgarh Raigarh hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Raigarh/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chhattisgarh Rajnandgaon hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Rajnandgaon/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chhattisgarh Sarguja hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Sarguja/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Chhattisgarh Sukma hf://datasets/ARTPARK-IISc/Vaani/audio/Chhattisgarh/Sukma/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Delhi NewDelhi 
hf://datasets/ARTPARK-IISc/Vaani/audio/Delhi/NewDelhi/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Goa NorthSouthGoa hf://datasets/ARTPARK-IISc/Vaani/audio/Goa/NorthSouthGoa/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Jharkhand Deoghar hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Deoghar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Jharkhand Garhwa hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Garhwa/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Jharkhand Jamtara hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Jamtara/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Jharkhand Palamu hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Palamu/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Jharkhand Ranchi hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Ranchi/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Jharkhand Sahebganj hf://datasets/ARTPARK-IISc/Vaani/audio/Jharkhand/Sahebganj/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Bangalore hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Bangalore/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Belgaum hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Belgaum/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Bellary hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Bellary/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Bidar hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Bidar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Bijapur hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Bijapur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Chamrajnagar 
hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Chamrajnagar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka DakshinKannada hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/DakshinKannada/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Dharwad hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Dharwad/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Gulbarga hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Gulbarga/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Koppal hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Koppal/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Mysore hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Mysore/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Raichur hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Raichur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Karnataka Shimoga hf://datasets/ARTPARK-IISc/Vaani/audio/Karnataka/Shimoga/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "MadhyaPradesh Bhopal hf://datasets/ARTPARK-IISc/Vaani/audio/MadhyaPradesh/Bhopal/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "MadhyaPradesh Dhar hf://datasets/ARTPARK-IISc/Vaani/audio/MadhyaPradesh/Dhar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "MadhyaPradesh Katni hf://datasets/ARTPARK-IISc/Vaani/audio/MadhyaPradesh/Katni/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Maharashtra Aurangabad hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Aurangabad/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Maharashtra Chandrapur hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Chandrapur/train-*.parquet 
/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Maharashtra Dhule hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Dhule/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Maharashtra Nagpur hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Nagpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Maharashtra Pune hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Pune/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Maharashtra Sindhudurga hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Sindhudurga/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Maharashtra Solapur hf://datasets/ARTPARK-IISc/Vaani/audio/Maharashtra/Solapur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Meghalaya WestGaroHills hf://datasets/ARTPARK-IISc/Vaani/audio/Meghalaya/WestGaroHills/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Nagaland Dimapur hf://datasets/ARTPARK-IISc/Vaani/audio/Nagaland/Dimapur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Nagaland Kohima hf://datasets/ARTPARK-IISc/Vaani/audio/Nagaland/Kohima/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Odisha Khordha hf://datasets/ARTPARK-IISc/Vaani/audio/Odisha/Khordha/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Rajasthan Churu hf://datasets/ARTPARK-IISc/Vaani/audio/Rajasthan/Churu/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Rajasthan Jaipur hf://datasets/ARTPARK-IISc/Vaani/audio/Rajasthan/Jaipur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Rajasthan Nagaur hf://datasets/ARTPARK-IISc/Vaani/audio/Rajasthan/Nagaur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "TamilNadu Chennai hf://datasets/ARTPARK-IISc/Vaani/audio/TamilNadu/Chennai/train-*.parquet 
/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "TamilNadu Kanyakumari hf://datasets/ARTPARK-IISc/Vaani/audio/TamilNadu/Kanyakumari/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "TamilNadu Namakkal hf://datasets/ARTPARK-IISc/Vaani/audio/TamilNadu/Namakkal/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "TamilNadu Nilgiris hf://datasets/ARTPARK-IISc/Vaani/audio/TamilNadu/Nilgiris/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Telangana Hyderabad hf://datasets/ARTPARK-IISc/Vaani/audio/Telangana/Hyderabad/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Telangana Karimnagar hf://datasets/ARTPARK-IISc/Vaani/audio/Telangana/Karimnagar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Telangana Mahabubabad hf://datasets/ARTPARK-IISc/Vaani/audio/Telangana/Mahabubabad/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Telangana Nalgonda hf://datasets/ARTPARK-IISc/Vaani/audio/Telangana/Nalgonda/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Tripura Dhalai hf://datasets/ARTPARK-IISc/Vaani/audio/Tripura/Dhalai/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Tripura Unakoti hf://datasets/ARTPARK-IISc/Vaani/audio/Tripura/Unakoti/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Tripura WestTripura hf://datasets/ARTPARK-IISc/Vaani/audio/Tripura/WestTripura/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Budaun hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Budaun/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Deoria hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Deoria/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Etah 
hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Etah/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Ghazipur hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Ghazipur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Gorakhpur hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Gorakhpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Hamirpur hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Hamirpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Jalaun hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Jalaun/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh JyotibaPhuleNagar hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/JyotibaPhuleNagar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Lalitpur hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Lalitpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Lucknow hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Lucknow/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Muzzaffarnagar hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Muzzaffarnagar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Saharanpur hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Saharanpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "UttarPradesh Varanasi hf://datasets/ARTPARK-IISc/Vaani/audio/UttarPradesh/Varanasi/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Uttarakhand TehriGarhwal hf://datasets/ARTPARK-IISc/Vaani/audio/Uttarakhand/TehriGarhwal/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "Uttarakhand Uttarkashi 
hf://datasets/ARTPARK-IISc/Vaani/audio/Uttarakhand/Uttarkashi/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal Alipurduar hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Alipurduar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal CoochBehar hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/CoochBehar/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal DakshinDinajpur hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/DakshinDinajpur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal Darjeeling hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Darjeeling/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal Jalpaiguri hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Jalpaiguri/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal Jhargram hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Jhargram/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal Kolkata hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Kolkata/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal Malda hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Malda/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal North24Parganas hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/North24Parganas/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal PaschimMedinipur hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/PaschimMedinipur/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "WestBengal Purulia hf://datasets/ARTPARK-IISc/Vaani/audio/WestBengal/Purulia/train-*.parquet /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n", "images images hf://datasets/ARTPARK-IISc/Vaani/audio/images/images/train-*.parquet 
/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [
    "import polars as pl\n",
    "from tqdm import tqdm, trange\n",
    "import os\n",
    "\n",
    "def make_filter_df(state, district, url, savedir):\n",
    "    \"\"\"Write the non-audio metadata of one district to a Parquet file.\n",
    "\n",
    "    Lazily scans the Parquet files matched by `url`, copies the `path` field\n",
    "    of the `audio` struct into an `audio_path` column, drops `audio` and\n",
    "    saves the result as `<savedir>/<state>_<district>_meta.parquet`.\n",
    "\n",
    "    Args:\n",
    "        state: First part of the output file name.\n",
    "        district: Second part of the output file name.\n",
    "        url: Source passed to `pl.scan_parquet`.\n",
    "        savedir: Directory the Parquet file is written to.\n",
    "\n",
    "    Returns:\n",
    "        Path of the Parquet file that was written.\n",
    "    \"\"\"\n",
    "    out_path = f\"{savedir}/{state}_{district}_meta.parquet\"\n",
    "    meta = (\n",
    "        pl.scan_parquet(url)\n",
    "        .with_columns(pl.col('audio').struct.field('path').alias('audio_path'))\n",
    "        .drop('audio')\n",
    "        .collect()\n",
    "    )\n",
    "    # write_parquet is called for its side effect only; its result is not kept.\n",
    "    meta.write_parquet(out_path, compression=\"gzip\")\n",
    "    print(f\"✅ Saved {state}_{district}_meta.parquet\")\n",
    "    return out_path\n",
    "\n",
    "savedir = \"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/MetaData\"\n",
    "os.makedirs(savedir, exist_ok=True)\n",
    "\n",
    "# Dry run: only echo the arguments make_filter_df would receive for each row.\n",
    "for _idx, row in tqdm(urls_df.iterrows(), total=len(urls_df), colour='yellow', ncols=70):\n",
    "    state = row['state']\n",
    "    district = row['district']\n",
    "    url = row['url']\n",
    "    print(state, district, url, savedir)"
 ] }, { "cell_type": "code", "execution_count": 15, "id": "e88548fb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['AndhraPradesh_Anantpur',\n", " 'AndhraPradesh_Chittoor',\n", " 'AndhraPradesh_Guntur',\n", " 'AndhraPradesh_Krishna',\n", " 'AndhraPradesh_Srikakulam',\n", " 'ArunachalPradesh_Longding',\n", " 'ArunachalPradesh_PapumPare',\n", " 'Assam_KamrupMetropolitan',\n", " 'Assam_Sonitpur',\n", " 'Bihar_Araria']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "# Config names that already have a metadata file in `savedir`.\n",
    "# Only `<config>_meta.parquet` entries are considered; any other file in the\n",
    "# directory is skipped.\n",
    "meta_suffix = \"_meta.parquet\"\n",
    "done = [\n",
    "    name.removesuffix(meta_suffix)\n",
    "    for name in sorted(os.listdir(savedir))\n",
    "    if name.endswith(meta_suffix)\n",
    "]\n",
    "done"
 ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.2" } }, "nbformat": 4, "nbformat_minor": 5 }