{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "from pyspark.sql import SparkSession\n", "import pandas as pd\n", "import polars as pl\n", "from tqdm.auto import tqdm, trange\n", "from concurrent.futures import ThreadPoolExecutor, as_completed\n", "\n", "SCRATCH = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani\"\n", "DATADIR = r\"/home/IITB/ai-at-ieor/23m1521/datasets/Vaani\"\n", "JSON_PATH = os.path.join(DATADIR, \"Vaani_IIsc_Artpark_Full_Data.json\")\n", "# IMAGES_PATH = os.path.join(SCRATCH, \"Images\")\n", "IMAGES_PARQUETS = os.path.join(SCRATCH, \"images_parquets\")\n", "AUDIO_URLS = \"audio_urls.txt\"\n", "IMAGES_URLS = \"images_urls.txt\"\n", "IMAGE_ROOT_URL = 'https://vaani.iisc.ac.in/'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2fd393fddd534c30ae2674438039ec69", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/9584932 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
image_namestatedistrictgenderaudio_languageaudio_name
0Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...MaharashtraAurangabadfemaleMarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...
1Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...MaharashtraAurangabadfemaleMarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...
2Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...MaharashtraAurangabadfemaleMarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...
3Images/IISc_VaaniProject_GENERIC_0073.jpgMaharashtraAurangabadfemaleMarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...
4Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...MaharashtraAurangabadfemaleMarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...
.....................
9584927Images/IISc_VaaniProject_GENERIC_0554.jpgKarnatakaChamarajanagarfemaleKannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584928Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...KarnatakaChamarajanagarfemaleKannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584929Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...KarnatakaChamarajanagarfemaleKannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584930Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...KarnatakaChamarajanagarfemaleKannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584931Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...KarnatakaChamarajanagarfemaleKannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
\n", "

9584932 rows × 6 columns

\n", "" ], "text/plain": [ " image_name state \\\n", "0 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n", "1 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n", "2 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n", "3 Images/IISc_VaaniProject_GENERIC_0073.jpg Maharashtra \n", "4 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n", "... ... ... \n", "9584927 Images/IISc_VaaniProject_GENERIC_0554.jpg Karnataka \n", "9584928 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n", "9584929 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n", "9584930 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n", "9584931 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n", "\n", " district gender audio_language \\\n", "0 Aurangabad female Marathi \n", "1 Aurangabad female Marathi \n", "2 Aurangabad female Marathi \n", "3 Aurangabad female Marathi \n", "4 Aurangabad female Marathi \n", "... ... ... ... \n", "9584927 Chamarajanagar female Kannada \n", "9584928 Chamarajanagar female Kannada \n", "9584929 Chamarajanagar female Kannada \n", "9584930 Chamarajanagar female Kannada \n", "9584931 Chamarajanagar female Kannada \n", "\n", " audio_name \n", "0 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n", "1 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n", "2 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n", "3 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n", "4 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n", "... ... \n", "9584927 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n", "9584928 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n", "9584929 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n", "9584930 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n", "9584931 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... 
\n", "\n", "[9584932 rows x 6 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(data_dict)\n", "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(128807,)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.image_name.unique().shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "///////////////////////////////////////////////////////////////////////" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "81ecfc03838d46ac9870a5bc942607d7", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/9584932 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
image_namestatedistrictgenderaudio_languageaudio_name
0Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...MaharashtraAurangabadfemaleMarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...
1Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...MaharashtraAurangabadfemaleMarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...
2Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...MaharashtraAurangabadfemaleMarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...
3Images/IISc_VaaniProject_GENERIC_0073.jpgMaharashtraAurangabadfemaleMarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...
4Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...MaharashtraAurangabadfemaleMarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...
.....................
9584927Images/IISc_VaaniProject_GENERIC_0554.jpgKarnatakaChamarajanagarfemaleKannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584928Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...KarnatakaChamarajanagarfemaleKannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584929Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...KarnatakaChamarajanagarfemaleKannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584930Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...KarnatakaChamarajanagarfemaleKannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584931Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...KarnatakaChamarajanagarfemaleKannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
\n", "

9584932 rows × 6 columns

\n", "" ], "text/plain": [ " image_name state \\\n", "0 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n", "1 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n", "2 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n", "3 Images/IISc_VaaniProject_GENERIC_0073.jpg Maharashtra \n", "4 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n", "... ... ... \n", "9584927 Images/IISc_VaaniProject_GENERIC_0554.jpg Karnataka \n", "9584928 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n", "9584929 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n", "9584930 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n", "9584931 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n", "\n", " district gender audio_language \\\n", "0 Aurangabad female Marathi \n", "1 Aurangabad female Marathi \n", "2 Aurangabad female Marathi \n", "3 Aurangabad female Marathi \n", "4 Aurangabad female Marathi \n", "... ... ... ... \n", "9584927 Chamarajanagar female Kannada \n", "9584928 Chamarajanagar female Kannada \n", "9584929 Chamarajanagar female Kannada \n", "9584930 Chamarajanagar female Kannada \n", "9584931 Chamarajanagar female Kannada \n", "\n", " audio_name \n", "0 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n", "1 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n", "2 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n", "3 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n", "4 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n", "... ... \n", "9584927 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n", "9584928 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n", "9584929 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n", "9584930 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n", "9584931 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... 
\n", "\n", "[9584932 rows x 6 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(data_dict)\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# df.to_parquet('Vaani-Images-Audio-MetaData.parquet', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Final ImageBy Full Meta, Pending" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "# import pandas as pd\n", "import fireducks.pandas as pd\n", "from tqdm import tqdm, trange\n", "import matplotlib.pyplot as plt\n", "\n", "DATADIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/English\"\n", "IMAGEDIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images\" \n", "FINAL_META = r\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/finalMETA.parquet\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfile_namefile_urlassertLanguagelanguagesSpokenstatedistrictgenderaudioFileNameimageFileNamepincodespeakerImageHash
02IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...Marathi[Marathi]MaharashtraAurangabadfemaleAudios/Aurangabad/IISc_VaaniProject_S_Maharash...Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...431105(iq~v-nq +lTC]QXDCSnJ2~23=+|Nq~nn(
13IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...Marathi[Marathi]MaharashtraAurangabadfemaleAudios/Aurangabad/IISc_VaaniProject_S_Maharash...Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...431105(iq~v-qq +lTC]QXDCSnJ2~23=+|Nq~miz
24IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...Marathi[Marathi]MaharashtraAurangabadfemaleAudios/Aurangabad/IISc_VaaniProject_S_Maharash...Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...431105(iq~v.pl +lTC]QXDCSnJ2~23=+|Nq~lo{
35IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...Marathi[Marathi]MaharashtraAurangabadfemaleAudios/Aurangabad/IISc_VaaniProject_S_Maharash...Images/IISc_VaaniProject_GENERIC_0073.jpg431105(iq~v+kl 1<0~A3:Aivx*
46IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...Marathi[Marathi]MaharashtraAurangabadfemaleAudios/Aurangabad/IISc_VaaniProject_S_Maharash...Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...431105(iq~w+rk +lTC]QXDCSnJ2~23=+|Nq(knx
.......................................
95849279696433IISc_VaaniProject_M_KA_Chamrajn_42017276_16081...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...Kannada[Kannada, English]KarnatakaChamarajanagarfemaleAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...Images/IISc_VaaniProject_GENERIC_0554.jpg571440(oi~r)iivq )~=/I+|Nq,nm
95849289696434IISc_VaaniProject_M_KA_Chamrajn_42017276_16053...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...Kannada[Kannada, English]KarnatakaChamarajanagarfemaleAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...571440(oi{t(iivq |JPWiCL]K^CTs=G~|80@|Avq)mp
95849299696435IISc_VaaniProject_M_KA_Chamrajn_42017276_12370...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...Kannada[Kannada, English]KarnatakaChamarajanagarfemaleAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...571440(kl}q-iivq |JPWiCL]K^CTs=G~|80@|Avr(kr
95849309696436IISc_VaaniProject_M_KA_Chamrajn_42017276_09272...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...Kannada[Kannada, English]KarnatakaChamarajanagarfemaleAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...571440~rk}s+iivq |JPWiCL]K^CTs=G~|80@|Avq*km
95849319696437IISc_VaaniProject_M_KA_Chamrajn_42017276_15323...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...Kannada[Kannada, English]KarnatakaChamarajanagarfemaleAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...571440(nlxt+iivq |JPWiCL]K^CTs=G~|80@|Avr,lq
\n", "

9584932 rows x 12 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "finalMETA = pd.read_parquet(FINAL_META)\n", "finalMETA" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand',\n", " 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal',\n", " 'Rajasthan', 'Uttarakhand', 'Goa'], dtype=object)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "finalMETA.state.unique()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Anantpur',\n", " 'Araria',\n", " 'Aurangabad',\n", " 'Balrampur',\n", " 'Bastar',\n", " 'Begusarai',\n", " 'Belgaum',\n", " 'Bellary',\n", " 'Bhagalpur',\n", " 'Bijapur',\n", " 'Bilaspur',\n", " 'Budaun',\n", " 'Chamarajanagar',\n", " 'Chandrapur',\n", " 'Chittoor',\n", " 'Churu',\n", " 'DakshinDinajpur',\n", " 'DakshinaKannada',\n", " 'Darbhanga',\n", " 'Deoria',\n", " 'Dharwad',\n", " 'Dhule',\n", " 'EastChamparan',\n", " 'Etah',\n", " 'Gaya',\n", " 'Ghazipur',\n", " 'Gopalganj',\n", " 'Gorakhpur',\n", " 'Gulbarga',\n", " 'Guntur',\n", " 'Hamirpur',\n", " 'Jahanabad',\n", " 'Jalaun',\n", " 'Jalpaiguri',\n", " 'Jamtara',\n", " 'Jamui',\n", " 'Jashpur',\n", " 'Jhargram',\n", " 'JyotibaPhuleNagar',\n", " 'Kabirdham',\n", " 'Karimnagar',\n", " 'Kishanganj',\n", " 'Kolkata',\n", " 'Korba',\n", " 'Krishna',\n", " 'Lakhisarai',\n", " 'Madhepura',\n", " 'Malda',\n", " 'Muzaffarnagar',\n", " 'Muzaffarpur',\n", " 'Mysore',\n", " 'Nagaur',\n", " 'Nagpur',\n", " 'Nalgonda',\n", " 'North24Parganas',\n", " 'NorthSouthGoa',\n", " 'PaschimMedinipur',\n", " 'Pune',\n", " 'Purnia',\n", " 'Purulia',\n", " 'Raichur',\n", " 'Raigarh',\n", " 'Rajnandgaon',\n", " 'Saharsa',\n", " 'Sahebganj',\n", " 'Samastipur',\n", " 'Saran',\n", " 'Sarguja',\n", " 'Shimoga',\n", " 'Sindhudurg',\n", " 'Sitamarhi',\n", " 'Solapur',\n", " 
'Srikakulam',\n", " 'Sukma',\n", " 'Supaul',\n", " 'TehriGarhwal',\n", " 'Uttarkashi',\n", " 'Vaishali',\n", " 'Varanasi',\n", " 'Vishakapattanam']" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(finalMETA.district.unique())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "128807" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(os.listdir(IMAGEDIR))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(128807,)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "finalMETA.imageFileName.unique().shape" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(54,)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "finalMETA.assertLanguage.unique().shape" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|\u001b[32m█████████████████████████\u001b[0m| 128807/128807 [29:41<00:00, 72.31it/s]\u001b[0m\n" ] } ], "source": [
 "import csv\n",
 "from tqdm import tqdm, trange\n",
 "\n",
 "\n",
 "# Aggregate finalMETA by imageFileName and write one CSV row per image.\n",
 "# The sorted language list fixes the order of the trailing 0/1 indicator columns.\n",
 "language_columns = sorted(finalMETA.assertLanguage.dropna().unique())\n",
 "with open('imageBY3.csv', mode='w', newline='', encoding='utf-8') as file:\n",
 "    writer = csv.writer(file)\n",
 "    writer.writerow(\n",
 "        [\"id\", \"imageFileName\", \"audioCounts\", \"total\",\n",
 "         \"state\", \"district\", \"assertLanguage\", \"audio_urls\"]\n",
 "        + language_columns\n",
 "    )\n",
 "\n",
 "    total = 0  # running sum of audioCounts over the images processed so far\n",
 "    write_rows_list = []  # rows buffered since the last write\n",
 "\n",
 "    pbar = tqdm(\n",
 "        finalMETA.groupby('imageFileName'),\n",
 "        colour=\"green\", ncols=70,\n",
 "        total=finalMETA['imageFileName'].nunique()\n",
 "    )\n",
 "\n",
 "    for i, (image, data) in enumerate(pbar):\n",
 "        assert_Languages = data['assertLanguage'].unique()\n",
 "        # Going through a url -> language dict collapses repeated file_url values.\n",
 "        assert_Languages_url_dict = data.set_index('file_url')['assertLanguage'].to_dict()\n",
 "        # language -> list of audio URLs recorded for this image in that language\n",
 "        audio_urls_dict = {lang: [url for url, v in assert_Languages_url_dict.items() if v == lang] for lang in assert_Languages}\n",
 "\n",
 "        # unique() keeps first-appearance order, so an image used in several\n",
 "        # regions used to be written with the same set of states in many\n",
 "        # different orders. Sorting gives each set a single canonical spelling;\n",
 "        # dropna() keeps sorted() from comparing str with a float NaN.\n",
 "        state = sorted(data['state'].dropna().unique().tolist())\n",
 "        district = sorted(data['district'].dropna().unique().tolist())\n",
 "\n",
 "        audioCounts = data['assertLanguage'].count()  # non-null rows only\n",
 "        total += audioCounts\n",
 "        lang_counts = data['assertLanguage'].value_counts().to_dict()\n",
 "\n",
 "        # 1 if the image has at least one audio in that language, else 0\n",
 "        lang_row = [int(lang in lang_counts) for lang in language_columns]\n",
 "\n",
 "        write_rows_list.append(\n",
 "            [i + 1, image, audioCounts, total, state, district, lang_counts, audio_urls_dict]\n",
 "            + lang_row\n",
 "        )\n",
 "\n",
 "        # Write and flush the buffer every 1000 images so a crash loses little work.\n",
 "        if i % 1000 == 0 and i > 0:\n",
 "            writer.writerows(write_rows_list)\n",
 "            file.flush()\n",
 "            write_rows_list = []\n",
 "\n",
 "    # Write whatever is left after the last full batch.\n",
 "    if write_rows_list:\n",
 "        writer.writerows(write_rows_list)"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idimageFileNameaudioCountstotalstatedistrictassertLanguageaudio_urlsAgariyaAngika...SadriSantaliShekhawatiSurgujiaSurjapuriTamilTeluguTuluUrduWagdi
01Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...5252['AndhraPradesh']['Anantpur']{'Telugu': 45, 'Hindi': 7}{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...00...0000001000
12Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...4193['AndhraPradesh']['Anantpur']{'Telugu': 35, 'Hindi': 6}{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...00...0000001000
23Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...53146['AndhraPradesh']['Anantpur']{'Telugu': 45, 'Hindi': 6, 'Bengali': 2}{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...00...0000001000
34Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...38184['AndhraPradesh']['Anantpur']{'Telugu': 32, 'Hindi': 5, 'Urdu': 1}{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...00...0000001010
45Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...57241['AndhraPradesh']['Anantpur']{'Telugu': 48, 'Hindi': 7, 'Urdu': 2}{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...00...0000001010
..................................................................
128802128803Images/IISc_VaaniProject_Vishakapattanam-SPECI...359584829['AndhraPradesh']['Vishakapattanam']{'Telugu': 35}{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...00...0000001000
128803128804Images/IISc_VaaniProject_Vishakapattanam-SPECI...199584848['AndhraPradesh']['Vishakapattanam']{'Telugu': 16, 'Hindi': 3}{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...00...0000001000
128804128805Images/IISc_VaaniProject_Vishakapattanam-SPECI...179584865['AndhraPradesh']['Vishakapattanam']{'Telugu': 15, 'Hindi': 2}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...00...0000001000
128805128806Images/IISc_VaaniProject_Vishakapattanam-SPECI...319584896['AndhraPradesh']['Vishakapattanam']{'Telugu': 31}{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...00...0000001000
128806128807Images/IISc_VaaniProject_Vishakapattanam-SPECI...369584932['AndhraPradesh']['Vishakapattanam']{'Telugu': 35, 'Hindi': 1}{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...00...0000001000
\n", "

128807 rows x 62 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "imageBY = pd.read_csv(\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/imageBY3.csv\")\n", "\n", "imageBY" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([\"['AndhraPradesh']\", \"['Bihar']\", \"['Maharashtra']\",\n", " \"['Chhattisgarh']\", \"['Karnataka']\", \"['UttarPradesh']\",\n", " \"['Rajasthan']\", \"['WestBengal']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'WestBengal', 'Uttarakhand', 'Goa', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Goa', 'Rajasthan', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Jharkhand', 'WestBengal', 
'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Bihar', 'Jharkhand', 'Goa', 'Rajasthan', 'Uttarakhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa', 'Maharashtra', 'WestBengal', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Maharashtra', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Goa', 'Uttarakhand', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'WestBengal', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 
'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Maharashtra', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Goa', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Goa']\",\n", " \"['Karnataka', 'Rajasthan', 'AndhraPradesh', 'Bihar', 'UttarPradesh', 'Maharashtra', 'WestBengal', 'Uttarakhand', 'Chhattisgarh', 'Jharkhand', 'Goa', 'Telangana']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa', 'Maharashtra']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Goa', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'Jharkhand', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Maharashtra', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Maharashtra', 'WestBengal', 'Jharkhand', 
'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa', 'Maharashtra']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Goa', 'Jharkhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Goa', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 
'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'WestBengal', 'Jharkhand', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa', 'Jharkhand', 'WestBengal']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Jharkhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Goa', 'Bihar', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Uttarakhand']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Bihar', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa', 'Jharkhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Jharkhand', 
'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Bihar', 'Rajasthan', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Maharashtra', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa', 'Uttarakhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Goa', 'Rajasthan', 'WestBengal', 'Uttarakhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Goa', 'WestBengal', 'Uttarakhand', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Goa', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Goa', 'Rajasthan', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Maharashtra', 'WestBengal', 'Goa', 'Jharkhand', 'Rajasthan', 'Uttarakhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 
'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'UttarPradesh', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Maharashtra', 'WestBengal', 'Jharkhand', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Maharashtra', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Goa', 'Jharkhand', 'Rajasthan']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Maharashtra', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Jharkhand', 'Goa']\",\n", " 
\"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'WestBengal', 'Maharashtra', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa', 'Rajasthan', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'Goa', 'Jharkhand', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Goa', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Jharkhand', 'Rajasthan', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Uttarakhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'WestBengal', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 
'WestBengal', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa', 'WestBengal', 'Jharkhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Goa', 'Rajasthan', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Maharashtra']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Goa', 'Jharkhand']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Goa', 'Rajasthan', 'WestBengal']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Bihar', 'Rajasthan', 'Uttarakhand', 'Goa', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Maharashtra', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 
'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Goa', 'Rajasthan', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'Uttarakhand', 'Bihar', 'UttarPradesh', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'Goa', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'Jharkhand', 'UttarPradesh', 'Goa', 'Rajasthan', 'WestBengal']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Rajasthan', 'WestBengal', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 
'Uttarakhand', 'Maharashtra', 'Rajasthan', 'Goa', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'WestBengal', 'Goa', 'Rajasthan', 'UttarPradesh']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Jharkhand', 'WestBengal', 'Rajasthan', 'UttarPradesh', 'Uttarakhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Goa', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Goa', 'Rajasthan', 'Uttarakhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Karnataka', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa', 'Jharkhand', 'Rajasthan']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'Bihar', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa', 'Jharkhand', 'Rajasthan']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Goa', 'WestBengal', 'Jharkhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Goa', 'Jharkhand']\",\n", " \"['Chhattisgarh', 
'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'Goa', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Goa', 'Rajasthan', 'Uttarakhand', 'Jharkhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'Goa', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Goa', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Goa', 'UttarPradesh', 'Uttarakhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'Maharashtra', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Goa', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'WestBengal', 'Rajasthan', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Goa', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'WestBengal', 
'Rajasthan', 'Goa', 'Uttarakhand', 'Jharkhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Goa', 'Bihar', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'WestBengal', 'Rajasthan']\",\n", " \"['Karnataka', 'Rajasthan', 'AndhraPradesh', 'Bihar', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Chhattisgarh', 'Goa', 'Jharkhand', 'Telangana']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Goa', 'Rajasthan', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Rajasthan', 'Goa', 'WestBengal', 'Uttarakhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'Goa', 'WestBengal', 'Uttarakhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 
'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa', 'Bihar', 'WestBengal', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Maharashtra', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Goa', 'Jharkhand', 'WestBengal', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Goa', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'Maharashtra', 'WestBengal', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Rajasthan', 'WestBengal']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'UttarPradesh', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 
'Goa', 'Maharashtra', 'Jharkhand']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Jharkhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Goa', 'WestBengal']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'Maharashtra', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'UttarPradesh', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Goa', 'Rajasthan']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 
'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Jharkhand', 'WestBengal', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Jharkhand', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Telangana', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Jharkhand', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Maharashtra', 'WestBengal', 'Jharkhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Goa', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Maharashtra', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Rajasthan', 
'Maharashtra', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Telangana', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'Bihar', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Goa', 'Jharkhand']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Goa', 'Jharkhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa', 'Bihar', 'Jharkhand', 'Rajasthan']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa', 'Rajasthan', 'Jharkhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'Maharashtra', 'Uttarakhand', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Maharashtra', 'Jharkhand', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 
'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Jharkhand', 'Telangana', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Goa', 'Jharkhand', 'WestBengal', 'Uttarakhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Jharkhand', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Maharashtra', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 
'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Telangana', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Jharkhand', 'Goa']\",\n", " \"['Chhattisgarh', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Telangana', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'WestBengal', 'Jharkhand', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Maharashtra', 'Goa', 'Rajasthan']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'WestBengal', 'Jharkhand', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Goa', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Maharashtra']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 
'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Goa', 'Jharkhand', 'Rajasthan', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Rajasthan']\",\n", " \"['Chhattisgarh', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'Telangana', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Goa', 'Jharkhand', 'Rajasthan', 'Uttarakhand']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Jharkhand', 'WestBengal', 'Goa', 'Rajasthan', 'Uttarakhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Maharashtra', 'WestBengal', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Bihar', 'WestBengal', 'Jharkhand', 'Rajasthan']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Jharkhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n", " 
\"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'UttarPradesh', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Bihar', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Maharashtra', 'Goa', 'Jharkhand', 'WestBengal']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Goa', 'Rajasthan', 'UttarPradesh', 'WestBengal']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Goa']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'Jharkhand', 'Maharashtra', 'Uttarakhand', 'WestBengal', 'Goa']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 
'WestBengal', 'Bihar', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Maharashtra', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Jharkhand', 'Uttarakhand', 'Goa', 'Rajasthan']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'WestBengal', 'Bihar', 'Jharkhand', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa', 'Jharkhand']\",\n", " \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa', 'Bihar', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Rajasthan', 'Jharkhand', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa', 'WestBengal', 'Jharkhand']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Uttarakhand', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Goa', 'Jharkhand', 'Rajasthan', 'WestBengal']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa', 'Jharkhand']\",\n", " 
\"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Goa', 'Rajasthan']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Maharashtra', 'Goa']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Goa', 'WestBengal', 'Rajasthan']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'Goa', 'WestBengal']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Jharkhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'Goa', 'WestBengal']\",\n", " \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Rajasthan', 'Goa', 'WestBengal', 'Uttarakhand', 'Jharkhand']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'Goa', 'WestBengal', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Goa', 'Jharkhand', 'WestBengal', 'Uttarakhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 
'UttarPradesh', 'Rajasthan', 'Goa', 'Jharkhand', 'Uttarakhand', 'WestBengal']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Goa', 'Uttarakhand', 'WestBengal', 'Jharkhand']\",\n", " \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Goa']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'WestBengal', 'Goa', 'Rajasthan', 'Uttarakhand']\",\n", " \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Goa', 'Rajasthan', 'Jharkhand']\",\n", " \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'Goa', 'Jharkhand', 'WestBengal']\",\n", " \"['Jharkhand']\", \"['Telangana']\", \"['Goa']\",\n", " \"['Bihar', 'Uttarakhand']\", \"['Uttarakhand']\"], dtype=object)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "imageBY.state.unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "({1}, {1})" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# state_len = set()\n", "# district_len = set()\n", "# for state, district in zip(imageBY.state.values, imageBY.district.values):\n", "# state = eval(state)\n", "# district = eval(district)\n", "# state_len.add(len(state))\n", "# district_len.add(len(district))\n", "# state_len, district_len" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# imageBY.state = imageBY.state.apply(lambda x: eval(x)[0])\n", "# imageBY.district = imageBY.district.apply(lambda x: eval(x)[0])\n", "# 
imageBY.to_csv(\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/imageBY3.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idimageFileNameaudioCountstotalstatedistrictassertLanguageaudio_urlsAgariyaAngika...SadriSantaliShekhawatiSurgujiaSurjapuriTamilTeluguTuluUrduWagdi
01Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...5252['AndhraPradesh']['Anantpur']{'Telugu': 45, 'Hindi': 7}{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...00...0000001000
12Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...4193['AndhraPradesh']['Anantpur']{'Telugu': 35, 'Hindi': 6}{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...00...0000001000
23Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...53146['AndhraPradesh']['Anantpur']{'Telugu': 45, 'Hindi': 6, 'Bengali': 2}{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...00...0000001000
34Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...38184['AndhraPradesh']['Anantpur']{'Telugu': 32, 'Hindi': 5, 'Urdu': 1}{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...00...0000001010
45Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...57241['AndhraPradesh']['Anantpur']{'Telugu': 48, 'Hindi': 7, 'Urdu': 2}{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...00...0000001010
..................................................................
128798128799Images/IISc_VaaniProject_Vishakapattanam-SPECI...329584687['AndhraPradesh']['Vishakapattanam']{'Telugu': 29, 'Hindi': 3}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...00...0000001000
128799128800Images/IISc_VaaniProject_Vishakapattanam-SPECI...359584722['AndhraPradesh']['Vishakapattanam']{'Telugu': 33, 'Hindi': 2}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...00...0000001000
128803128804Images/IISc_VaaniProject_Vishakapattanam-SPECI...199584848['AndhraPradesh']['Vishakapattanam']{'Telugu': 16, 'Hindi': 3}{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...00...0000001000
128804128805Images/IISc_VaaniProject_Vishakapattanam-SPECI...179584865['AndhraPradesh']['Vishakapattanam']{'Telugu': 15, 'Hindi': 2}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...00...0000001000
128806128807Images/IISc_VaaniProject_Vishakapattanam-SPECI...369584932['AndhraPradesh']['Vishakapattanam']{'Telugu': 35, 'Hindi': 1}{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...00...0000001000
\n", "

105940 rows x 62 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "imageBY.loc[imageBY.Hindi == 1]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(82.24708284487645, 0.3633342908382308, 81.99010923319385)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Percentage of imageBY rows flagged Hindi, flagged English, and flagged Hindi but not English.\n", "# The counts are taken from the frame instead of being copied by hand from earlier outputs,\n", "# so the figures stay in step with the data. int() keeps the results plain Python floats.\n", "Hindi = int((imageBY.Hindi == 1).sum()) * 100 / len(imageBY)\n", "English = int((imageBY.English == 1).sum()) * 100 / len(imageBY)\n", "HindiNotEnglish = int(((imageBY.Hindi == 1) & (imageBY.English != 1)).sum()) * 100 / len(imageBY)\n", "Hindi, English, HindiNotEnglish" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idimageFileNameaudioCountstotalstatedistrictassertLanguageaudio_urlsAgariyaAngika...SadriSantaliShekhawatiSurgujiaSurjapuriTamilTeluguTuluUrduWagdi
01Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...5252['AndhraPradesh']['Anantpur']{'Telugu': 45, 'Hindi': 7}{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...00...0000001000
12Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...4193['AndhraPradesh']['Anantpur']{'Telugu': 35, 'Hindi': 6}{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...00...0000001000
23Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...53146['AndhraPradesh']['Anantpur']{'Telugu': 45, 'Hindi': 6, 'Bengali': 2}{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...00...0000001000
34Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...38184['AndhraPradesh']['Anantpur']{'Telugu': 32, 'Hindi': 5, 'Urdu': 1}{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...00...0000001010
45Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...57241['AndhraPradesh']['Anantpur']{'Telugu': 48, 'Hindi': 7, 'Urdu': 2}{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...00...0000001010
..................................................................
105604128799Images/IISc_VaaniProject_Vishakapattanam-SPECI...329584687['AndhraPradesh']['Vishakapattanam']{'Telugu': 29, 'Hindi': 3}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...00...0000001000
105605128800Images/IISc_VaaniProject_Vishakapattanam-SPECI...359584722['AndhraPradesh']['Vishakapattanam']{'Telugu': 33, 'Hindi': 2}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...00...0000001000
105606128804Images/IISc_VaaniProject_Vishakapattanam-SPECI...199584848['AndhraPradesh']['Vishakapattanam']{'Telugu': 16, 'Hindi': 3}{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...00...0000001000
105607128805Images/IISc_VaaniProject_Vishakapattanam-SPECI...179584865['AndhraPradesh']['Vishakapattanam']{'Telugu': 15, 'Hindi': 2}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...00...0000001000
105608128807Images/IISc_VaaniProject_Vishakapattanam-SPECI...369584932['AndhraPradesh']['Vishakapattanam']{'Telugu': 35, 'Hindi': 1}{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...00...0000001000
\n", "

105609 rows x 62 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HindiNotEnglish_df = imageBY.loc[(imageBY.English != 1) & (imageBY.Hindi == 1)].reset_index(drop=True)\n", "HindiNotEnglish_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from ast import literal_eval\n", "\n", "# Target districts, written as State_District.\n", "Hindi_district = [\n", "    'Delhi_NewDelhi', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Katni',\n", "    'Chhattisgarh_Bilaspur', 'Maharashtra_Nagpur', 'UttarPradesh_Varanasi',\n", "    'UttarPradesh_Lucknow', 'UttarPradesh_Gorakhpur'\n", "]\n", "# Keep only the part after the last underscore (the district name).\n", "Hindi_district = [i.split(\"_\")[-1] for i in Hindi_district]\n", "print(f\"['{Hindi_district[1]}']\")\n", "\n", "# Debug pass over the first 1000 rows: print whether the first target district\n", "# occurs in each row's district list.\n", "for i, row in tqdm(HindiNotEnglish_df.iterrows(), colour='blue', total=HindiNotEnglish_df.shape[0], ncols=70):\n", "    if i == 1000:\n", "        break  # previously `continue`, which skipped a single row and still printed every other one\n", "    # The district column stores the text form of a Python list; literal_eval parses it\n", "    # without executing arbitrary code the way eval would.\n", "    row_districts = literal_eval(row['district'])\n", "    print(Hindi_district[0], row_districts, Hindi_district[0] in row_districts)\n", "    # if Hindi_district[0] in row_districts:\n", "    #     print(row)\n", "    #     break" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfile_namefile_urlmetadata
02IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...
13IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...
24IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...
35IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...
46IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...
...............
95849279696433IISc_VaaniProject_M_KA_Chamrajn_42017276_16081...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...
95849289696434IISc_VaaniProject_M_KA_Chamrajn_42017276_16053...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...
95849299696435IISc_VaaniProject_M_KA_Chamrajn_42017276_12370...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...
95849309696436IISc_VaaniProject_M_KA_Chamrajn_42017276_09272...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...
95849319696437IISc_VaaniProject_M_KA_Chamrajn_42017276_15323...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...
\n", "

9584932 rows × 4 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Read the full metadata JSON into a DataFrame and display it.\n", "JSON_PATH = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Vaani_IIsc_Artpark_Full_Data.json\"\n", "jsondf = pd.read_json(JSON_PATH)\n", "jsondf" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Fields to lift out of the nested `metadata` records.\n", "metadata_cols = [\n", "    'assertLanguage', 'languagesSpoken', 'state', 'district', 'gender',\n", "    'audioFileName', 'imageFileName', 'pincode', 'speakerImageHash',\n", "]\n", "\n", "# Flatten the metadata dicts into columns and keep only the selected fields.\n", "meta_df = pd.json_normalize(jsondf['metadata']).loc[:, metadata_cols]\n", "# Place the flattened fields beside the remaining top-level columns.\n", "finalMETA = pd.concat([jsondf.drop('metadata', axis=1), meta_df], axis=1)\n", "finalMETA" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Parquet',\n", " 'English',\n", " 'train-00000-of-00057.parquet',\n", " 'NewDelhi_train-00000-of-00054.parquet']" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "# List what is present in the Audios directory.\n", "os.listdir(\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
audiolanguagelanguagesKnowngenderstatedistrictpincodestay(years)isTranscriptionAvailabletranscriptreferenceImage
0{'bytes': b'RIFF\\xd2\\x04\\x01\\x00WAVEfmt \\x10\\x...Hindi['Hindi']FemaleDelhiNewDelhi110004NewDelhi(20)NoNoneImages/IISc_VaaniProject_GENERIC_0473.jpg
1{'bytes': b'RIFF\\xb0h\\x01\\x00WAVEfmt \\x10\\x00\\...Hindi['Hindi']FemaleDelhiNewDelhi110001NewDelhi(19)NoNoneImages/IISc_VaaniProject_GENERIC_1011.jpg
2{'bytes': b'RIFF\\x84+\\x01\\x00WAVEfmt \\x10\\x00\\...Hindi['Hindi']FemaleDelhiNewDelhi110067NewDelhi(11)NoNoneImages/IISc_VaaniProject_NewDelhi-SPECIFIC_015...
3{'bytes': b'RIFF2\\xd3\\x01\\x00WAVEfmt \\x10\\x00\\...Hindi['Hindi']MaleDelhiNewDelhi110001NewDelhi(24)Yesऐच_डी_ऐफ_सी बैंक {H_D_F_C bank} का और उसमे एक ...Images/IISc_VaaniProject_GENERIC_0418.jpg
4{'bytes': b'RIFF\\xe45\\x01\\x00WAVEfmt \\x10\\x00\\...Hindi['Hindi']MaleDelhiNewDelhi110023NewDelhi(20)NoNoneImages/IISc_VaaniProject_GENERIC_0851.jpg
....................................
3035{'bytes': b'RIFF\\xbch\\x02\\x00WAVEfmt \\x10\\x00\\...Hindi['Hindi']FemaleDelhiNewDelhi110038NewDelhi(20)NoNoneImages/IISc_VaaniProject_GENERIC_0193.jpg
3036{'bytes': b'RIFFJD\\x01\\x00WAVEfmt \\x10\\x00\\x00...Hindi['Hindi']FemaleDelhiNewDelhi110028NewDelhi(20)NoNoneImages/IISc_VaaniProject_GENERIC_0249.jpg
3037{'bytes': b'RIFF\\x12\\x02\\x01\\x00WAVEfmt \\x10\\x...Hindi['Hindi']FemaleDelhiNewDelhi110023NewDelhi(20)NoNoneImages/IISc_VaaniProject_GENERIC_1268.jpg
3038{'bytes': b'RIFFz\\xbc\\x01\\x00WAVEfmt \\x10\\x00\\...Hindi['Hindi']FemaleDelhiNewDelhi110011NewDelhi(23)NoNoneImages/IISc_VaaniProject_NewDelhi-SPECIFIC_016...
3039{'bytes': b'RIFF\\xc8\\xa4\\x01\\x00WAVEfmt \\x10\\x...Hindi['Hindi']FemaleDelhiNewDelhi110011NewDelhi(23)NoNoneImages/IISc_VaaniProject_NewDelhi-SPECIFIC_012...
\n", "

3040 rows x 11 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Load one parquet shard from the Audios directory and display it.\n", "parquet_df = pd.read_parquet(path=\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/NewDelhi_train-00000-of-00054.parquet\")\n", "parquet_df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Images/IISc_VaaniProject_Anantpur-SPECIFIC_00001.jpg'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Image file name stored at index label 0 of the Hindi-but-not-English subset.\n", "HindiNotEnglish_df['imageFileName'][0]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfile_namefile_urlassertLanguagelanguagesSpokenstatedistrictgenderaudioFileNameimageFileNamepincodespeakerImageHash
135079135081IISc_VaaniProject_S_AP_Anantpur_113390_1167518...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu, Hindi]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515556(jo}v(qk +eCPcZlTfB:<|+53:Aivq~j
135088135090IISc_VaaniProject_S_AP_Anantpur_113390_1167518...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu, Hindi]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515556(jo}v(qk +eCPcZlTfB:<|+53:Aivq~j
408373408532IISc_VaaniProject_S_AP_Anantpur_94940_10873932...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Hindi[Hindi, Urdu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515763(iq}t0lk +eCPcZlTfB:<|+53:Aivq~j
792425792662IISc_VaaniProject_S_AP_Anantpur_112124_1158582...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515672(jn~v/km +eCPcZlTfB:<|+53:Aivq~j
792436792673IISc_VaaniProject_S_AP_Anantpur_112124_1158582...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515672(jn~v/km +eCPcZlTfB:<|+53:Aivq~j
10518901052127IISc_VaaniProject_S_AP_Anantpur_107463_1139557...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515134(jl(v,pq +eCPcZlTfB:<|+53:Aivq~j
10519021052139IISc_VaaniProject_S_AP_Anantpur_107463_1139557...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515134(jl(v,pq +eCPcZlTfB:<|+53:Aivq~j
11533171153554IISc_VaaniProject_S_AP_Anantpur_104144_1121525...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515812(jkwv)nk +eCPcZlTfB:<|+53:Aivq~j
11533481153585IISc_VaaniProject_S_AP_Anantpur_104144_1121525...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515812(jkwv)nk +eCPcZlTfB:<|+53:Aivq~j
14533231453709IISc_VaaniProject_S_AP_Anantpur_121072_1220106...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Hindi[English, Hindi]AndhraPradeshAnantpurmaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515133(kkvr~ok +eCPcZlTfB:<|+53:Aivq~j
14534201453806IISc_VaaniProject_S_AP_Anantpur_121072_1220106...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Hindi[English, Hindi]AndhraPradeshAnantpurmaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515133(kkvr~ok +eCPcZlTfB:<|+53:Aivq~j
14558741456260IISc_VaaniProject_S_AP_Anantpur_116830_1187130...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Hindi, Telugu]AndhraPradeshAnantpurmaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515631(jq}r*im +eCPcZlTfB:<|+53:Aivq~j
14560821456468IISc_VaaniProject_S_AP_Anantpur_116830_1187130...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Hindi, Telugu]AndhraPradeshAnantpurmaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515631(jq}r*im +eCPcZlTfB:<|+53:Aivq~j
40830914084419IISc_VaaniProject_S_AP_Anantpur_155564_1387714...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Hindi[Hindi, Malayalam]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515401(lq}x(mo +eCPcZlTfB:<|+53:Aivq~j
40831334084461IISc_VaaniProject_S_AP_Anantpur_155564_1387714...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Hindi[Hindi, Malayalam]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515401(lq}x(mo +eCPcZlTfB:<|+53:Aivq~j
40840384085366IISc_VaaniProject_S_AP_Anantpur_118450_1192469...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Hindi[Hindi]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515867(jrxu-ro +eCPcZlTfB:<|+53:Aivq~j
40840724085400IISc_VaaniProject_S_AP_Anantpur_118450_1192469...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Hindi[Hindi]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515867(jrxu-ro +eCPcZlTfB:<|+53:Aivq~j
49104404913075IISc_VaaniProject_M_AP_Anantpur_Pras22443_1156...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu, Hindi, Kannada]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515154(jn|v,iivq zPPXkRWanJ2~23=+|Nq~iiw
49104754913110IISc_VaaniProject_M_AP_Anantpur_Pras22443_1156...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu, Hindi, Kannada]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515154(jn|v,iivq zPPXkRWanJ2~23=+|Nq~iiw
49155154918150IISc_VaaniProject_M_AP_Anantpur_Push20666_1748...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515812(pm~u*iivq zPPXkRWanJ2~23=+|Nq~iiw
49155294918164IISc_VaaniProject_M_AP_Anantpur_Push20666_1748...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515812(pm~u*iivq zPPXkRWanJ2~23=+|Nq~iiw
49161774918812IISc_VaaniProject_M_AP_Anantpur_Chan92399_0731...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515671~plwt.iivq zPPXkRWanJ2~23=+|Nq~iiw
49164844919119IISc_VaaniProject_M_AP_Anantpur_Chan92399_0731...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515671~plwt.iivq zPPXkRWanJ2~23=+|Nq~iiw
49166714919306IISc_VaaniProject_M_AP_Anantpur_Chan92399_0731...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515671~plwt.iivq zPPXkRWanJ2~23=+|Nq~iiw
49166874919322IISc_VaaniProject_M_AP_Anantpur_Chan92399_0731...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurfemaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515671~plwt.iivq zPPXkRWanJ2~23=+|Nq~iiw
49493294951964IISc_VaaniProject_M_AP_Anantpur_Nare62582_1417...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurmaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515671(mj}u0iivq zPPXkRWanJ2~23=+|Nq~iiw
49494014952036IISc_VaaniProject_M_AP_Anantpur_Nare62582_1417...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurmaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515671(mj}u0iivq zPPXkRWanJ2~23=+|Nq~iiw
49495374952172IISc_VaaniProject_M_AP_Anantpur_Nare62582_1417...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurmaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515671(mj}u0iivq zPPXkRWanJ2~23=+|Nq~iiw
49721814974816IISc_VaaniProject_M_AP_Anantpur_Prud35888_1142...https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...Telugu[Telugu]AndhraPradeshAnantpurmaleAudios/Anantpur/IISc_VaaniProject_M_AP_Anantpu...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515556(jmxu*iivq zPPXkRWanJ2~23=+|Nq~iiw
60256396137143IISc_VaaniProject_M_AP_Anantpur_Priy61103_0633...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515435~olyr/iivq zPPXkRWanJ2~23=+|Nq~iiw
60257326137236IISc_VaaniProject_M_AP_Anantpur_Priy61103_0633...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515435~olyr/iivq zPPXkRWanJ2~23=+|Nq~iiw
60257396137243IISc_VaaniProject_M_AP_Anantpur_Priy61103_0633...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515435~olyr/iivq zPPXkRWanJ2~23=+|Nq~iiw
60257446137248IISc_VaaniProject_M_AP_Anantpur_Priy61103_0633...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515435~olyr/iivq zPPXkRWanJ2~23=+|Nq~iiw
60257616137265IISc_VaaniProject_M_AP_Anantpur_Priy61103_0633...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515435~olyr/iivq zPPXkRWanJ2~23=+|Nq~iiw
63821996493703IISc_VaaniProject_M_AP_Anantpur_Pill40436_0912...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515501~rjxv~iivq zPPXkRWanJ2~23=+|Nq~iiw
63822006493704IISc_VaaniProject_M_AP_Anantpur_Pill40436_0912...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515501~rjxv~iivq zPPXkRWanJ2~23=+|Nq~iiw
63822086493712IISc_VaaniProject_M_AP_Anantpur_Pill40436_0912...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515501~rjxv~iivq zPPXkRWanJ2~23=+|Nq~iiw
65703116681815IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu, Hindi, English]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515261(oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw
65703166681820IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu, Hindi, English]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515261(oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw
65703176681821IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu, Hindi, English]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515261(oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw
65703196681823IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu, Hindi, English]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515261(oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw
65703236681827IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu, Hindi, English]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515261(oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw
65703296681833IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu, Hindi, English]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515261(oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw
65703306681834IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu, Hindi, English]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515261(oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw
73856707497174IISc_VaaniProject_M_AP_Anantpur_BODA75520_1956...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu, English]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515425(rn|s+iivq zPPXkRWanJ2~23=+|Nq~iiw
73857137497217IISc_VaaniProject_M_AP_Anantpur_BODA75520_1956...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu, English]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515425(rn|s+iivq zPPXkRWanJ2~23=+|Nq~iiw
76851787796682IISc_VaaniProject_S_AP_Anantpur_119578_1197153...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Hindi, Telugu]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515281(jr}r,lm +eCPcZlTfB:<|+53:Aivq~j
77171907828694IISc_VaaniProject_S_AP_Anantpur_119578_1197153...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Hindi, Telugu]AndhraPradeshAnantpurmale/Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515281(jr}r,lm +eCPcZlTfB:<|+53:Aivq~j
92386209350126IISc_VaaniProject_S_AP_Anantpur_109401_1144986...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515001(jmzz/oi +eCPcZlTfB:<|+53:Aivq~j
92386319350137IISc_VaaniProject_S_AP_Anantpur_109401_1144986...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515001(jmzz/oi +eCPcZlTfB:<|+53:Aivq~j
92390769350582IISc_VaaniProject_S_AP_Anantpur_88704_10558507...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515425(in{y,ip +eCPcZlTfB:<|+53:Aivq~j
92390789350584IISc_VaaniProject_S_AP_Anantpur_88704_10558507...https://vaani.iisc.ac.in//Audios/Anantpur/IISc...Telugu[Telugu]AndhraPradeshAnantpurfemale/Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp...Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...515425(in{y,ip +eCPcZlTfB:<|+53:Aivq~j
\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "finalMETA.loc[finalMETA.imageFileName == HindiNotEnglish_df.imageFileName[0]]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idimageFileNameaudioCountstotalstatedistrictassertLanguageaudio_urlsAgariyaAngika...SadriSantaliShekhawatiSurgujiaSurjapuriTamilTeluguTuluUrduWagdi
1052910530Images/IISc_VaaniProject_Bellary-SPECIFIC_0023...46417891['Karnataka']['Bellary']{'Kannada': 40, 'Hindi': 2, 'Telugu': 2, 'Engl...{'Hindi': ['https://vaani.iisc.ac.in/Audios/Be...00...0000001000
1058010581Images/IISc_VaaniProject_Bellary-SPECIFIC_0028...28419513['Karnataka']['Bellary']{'Kannada': 17, 'Telugu': 7, 'Hindi': 2, 'Engl...{'Kannada': ['https://vaani.iisc.ac.in/Audios/...00...0000001000
1085510856Images/IISc_VaaniProject_Bellary-SPECIFIC_0055...40428720['Karnataka']['Bellary']{'Kannada': 25, 'Telugu': 11, 'English': 2, 'H...{'Kannada': ['https://vaani.iisc.ac.in/Audios/...00...0000001000
1086210863Images/IISc_VaaniProject_Bellary-SPECIFIC_0056...37428927['Karnataka']['Bellary']{'Kannada': 22, 'Telugu': 11, 'Hindi': 2, 'Eng...{'Telugu': ['https://vaani.iisc.ac.in/Audios/B...00...0000001000
1094210943Images/IISc_VaaniProject_Bellary-SPECIFIC_0064...39431624['Karnataka']['Bellary']{'Kannada': 31, 'Telugu': 4, 'English': 2, 'Hi...{'Kannada': ['https://vaani.iisc.ac.in/Audios/...00...0000001000
..................................................................
7086470865Images/IISc_VaaniProject_Krishna-SPECIFIC_0029...287336979['AndhraPradesh']['Krishna']{'Telugu': 21, 'Hindi': 4, 'English': 3}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Kr...00...0000001000
7115371154Images/IISc_VaaniProject_Krishna-SPECIFIC_0057...347346921['AndhraPradesh']['Krishna']{'Telugu': 29, 'Hindi': 3, 'English': 2}{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...00...0000001000
7116171162Images/IISc_VaaniProject_Krishna-SPECIFIC_0058...397347205['AndhraPradesh']['Krishna']{'Telugu': 32, 'Hindi': 5, 'English': 2}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Kr...00...0000001000
7166571666Images/IISc_VaaniProject_Krishna-SPECIFIC_0109...417364827['AndhraPradesh']['Krishna']{'Telugu': 36, 'English': 3, 'Hindi': 2}{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...00...0000001000
7172971730Images/IISc_VaaniProject_Krishna-SPECIFIC_0115...327367046['AndhraPradesh']['Krishna']{'Telugu': 25, 'Hindi': 4, 'English': 3}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Kr...00...0000001000
\n", "

331 rows x 62 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "imageBY.loc[(imageBY.English == 1) & (imageBY.Hindi == 1)]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idimageFileNameaudioCountstotalstatedistrictassertLanguageaudio_urlsAgariyaAngika...SadriSantaliShekhawatiSurgujiaSurjapuriTamilTeluguTuluUrduWagdi
1030910310Images/IISc_VaaniProject_Bellary-SPECIFIC_0001...41410798['Karnataka']['Bellary']{'Kannada': 20, 'Telugu': 13, 'Urdu': 4, 'Bear...{'Telugu': ['https://vaani.iisc.ac.in/Audios/B...00...0000001010
1032210323Images/IISc_VaaniProject_Bellary-SPECIFIC_0002...33411256['Karnataka']['Bellary']{'Kannada': 25, 'Telugu': 5, 'English': 2, 'Be...{'Kannada': ['https://vaani.iisc.ac.in/Audios/...00...0000001000
1034110342Images/IISc_VaaniProject_Bellary-SPECIFIC_0004...40411887['Karnataka']['Bellary']{'Kannada': 33, 'Telugu': 5, 'English': 2}{'Kannada': ['https://vaani.iisc.ac.in/Audios/...00...0000001000
1039410395Images/IISc_VaaniProject_Bellary-SPECIFIC_0009...34413709['Karnataka']['Bellary']{'Kannada': 28, 'Telugu': 4, 'English': 2}{'Kannada': ['https://vaani.iisc.ac.in/Audios/...00...0000001000
1050810509Images/IISc_VaaniProject_Bellary-SPECIFIC_0020...42417224['Karnataka']['Bellary']{'Kannada': 31, 'Telugu': 9, 'English': 2}{'Kannada': ['https://vaani.iisc.ac.in/Audios/...00...0000001000
..................................................................
7127571276Images/IISc_VaaniProject_Krishna-SPECIFIC_0070...287351129['AndhraPradesh']['Krishna']{'Telugu': 25, 'English': 3}{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...00...0000001000
7137271373Images/IISc_VaaniProject_Krishna-SPECIFIC_0080...317354457['AndhraPradesh']['Krishna']{'Telugu': 29, 'English': 2}{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...00...0000001000
7165071651Images/IISc_VaaniProject_Krishna-SPECIFIC_0107...347364295['AndhraPradesh']['Krishna']{'Telugu': 32, 'English': 2}{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...00...0000001000
7166571666Images/IISc_VaaniProject_Krishna-SPECIFIC_0109...417364827['AndhraPradesh']['Krishna']{'Telugu': 36, 'English': 3, 'Hindi': 2}{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...00...0000001000
7172971730Images/IISc_VaaniProject_Krishna-SPECIFIC_0115...327367046['AndhraPradesh']['Krishna']{'Telugu': 25, 'Hindi': 4, 'English': 3}{'Hindi': ['https://vaani.iisc.ac.in/Audios/Kr...00...0000001000
\n", "

468 rows x 62 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "imageBY.loc[imageBY.English == 1]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 {'Telugu': 45, 'Hindi': 7}\n", "1 {'Telugu': 35, 'Hindi': 6}\n", "2 {'Telugu': 45, 'Hindi': 6, 'Bengali': 2}\n", "3 {'Telugu': 32, 'Hindi': 5, 'Urdu': 1}\n", "4 {'Telugu': 48, 'Hindi': 7, 'Urdu': 2}\n", " ... \n", "128802 {'Telugu': 35}\n", "128803 {'Telugu': 16, 'Hindi': 3}\n", "128804 {'Telugu': 15, 'Hindi': 2}\n", "128805 {'Telugu': 31}\n", "128806 {'Telugu': 35, 'Hindi': 1}\n", "Name: assertLanguage, Length: 128807, dtype: object" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "imageBY.assertLanguage" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# imageBY_data = {'imageFileName': [], 'state': [], 'district': [],}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Image - Audio(Hindi) for CSIP" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: the original JSON file has not been updated to include the Delhi data." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "# import fireducks.pandas as pd\n", "from tqdm import tqdm, trange\n", "\n", "HINDI_AUDIO_DIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi\"\n", "IMAGEDIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images\"\n", "JSON_PATH = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Vaani_IIsc_Artpark_Full_Data.json\"" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfile_namefile_urlmetadata
02IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...
13IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...
24IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...
35IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...
46IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...
...............
95849279696433IISc_VaaniProject_M_KA_Chamrajn_42017276_16081...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...
95849289696434IISc_VaaniProject_M_KA_Chamrajn_42017276_16053...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...
95849299696435IISc_VaaniProject_M_KA_Chamrajn_42017276_12370...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...
95849309696436IISc_VaaniProject_M_KA_Chamrajn_42017276_09272...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...
95849319696437IISc_VaaniProject_M_KA_Chamrajn_42017276_15323...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...
\n", "

9584932 rows × 4 columns

\n", "
" ], "text/plain": [ " id file_name \\\n", "0 2 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n", "1 3 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n", "2 4 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n", "3 5 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n", "4 6 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n", "... ... ... \n", "9584927 9696433 IISc_VaaniProject_M_KA_Chamrajn_42017276_16081... \n", "9584928 9696434 IISc_VaaniProject_M_KA_Chamrajn_42017276_16053... \n", "9584929 9696435 IISc_VaaniProject_M_KA_Chamrajn_42017276_12370... \n", "9584930 9696436 IISc_VaaniProject_M_KA_Chamrajn_42017276_09272... \n", "9584931 9696437 IISc_VaaniProject_M_KA_Chamrajn_42017276_15323... \n", "\n", " file_url \\\n", "0 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n", "1 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n", "2 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n", "3 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n", "4 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n", "... ... \n", "9584927 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n", "9584928 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n", "9584929 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n", "9584930 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n", "9584931 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n", "\n", " metadata \n", "0 {'state': 'Maharashtra', 'gender': 'female', '... \n", "1 {'state': 'Maharashtra', 'gender': 'female', '... \n", "2 {'state': 'Maharashtra', 'gender': 'female', '... \n", "3 {'state': 'Maharashtra', 'gender': 'female', '... \n", "4 {'state': 'Maharashtra', 'gender': 'female', '... \n", "... ... \n", "9584927 {'state': 'Karnataka', 'gender': 'female', 'pi... \n", "9584928 {'state': 'Karnataka', 'gender': 'female', 'pi... \n", "9584929 {'state': 'Karnataka', 'gender': 'female', 'pi... \n", "9584930 {'state': 'Karnataka', 'gender': 'female', 'pi... 
\n", "9584931 {'state': 'Karnataka', 'gender': 'female', 'pi... \n", "\n", "[9584932 rows x 4 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jsondf = pd.read_json(JSON_PATH)\n", "jsondf" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 9584932/9584932 [00:02<00:00, 4578604.24it/s]\n" ] } ], "source": [ "for i in tqdm(jsondf.file_name.values):\n", " # if 'Delhi' in i:\n", " if i == 'IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav':\n", " print(i)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfile_namefile_urlmetadataassertLanguageaudioFileNameaudioManualQCaudioQualityCheckdistrictgender...imageFaceData.face_90.facial_areaimageFaceData.face_91.facial_areaimageFaceData.face_92.facial_areaimageFaceData.face_93.facial_areaimageFaceData.face_94.facial_areaimageFaceData.face_95.facial_areaimageFaceData.face_96.facial_areaimageFaceData.face_97.facial_areaimageFaceData.face_98.facial_areaimageFaceData.face_99.facial_area
02IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...MarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...NaNAutomatedAurangabadfemale...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
13IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...MarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...NaNAutomatedAurangabadfemale...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
24IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...MarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...NaNAutomatedAurangabadfemale...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
35IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...MarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...NaNAutomatedAurangabadfemale...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
46IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'state': 'Maharashtra', 'gender': 'female', '...MarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...NaNAutomated&ManualAurangabadfemale...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
95849279696433IISc_VaaniProject_M_KA_Chamrajn_42017276_16081...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...KannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...NaNAutomatedChamarajanagarfemale...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
95849289696434IISc_VaaniProject_M_KA_Chamrajn_42017276_16053...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...KannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...NaNAutomated&ManualChamarajanagarfemale...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
95849299696435IISc_VaaniProject_M_KA_Chamrajn_42017276_12370...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...KannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...NaNAutomatedChamarajanagarfemale...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
95849309696436IISc_VaaniProject_M_KA_Chamrajn_42017276_09272...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...KannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...NaNAutomatedChamarajanagarfemale...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
95849319696437IISc_VaaniProject_M_KA_Chamrajn_42017276_15323...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'state': 'Karnataka', 'gender': 'female', 'pi...KannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...NaNAutomatedChamarajanagarfemale...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

9584932 rows × 251 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fullJSON = pd.concat([jsondf, pd.json_normalize(jsondf.metadata)], axis=1)\n", "fullJSON" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fullJSON.to_parquet(\"Vaani-Images-Audio-JSON.parquet\", index=False)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfile_namefile_urlmetadataassertLanguageaudioFileNameaudioManualQCaudioQualityCheckdistrictgender...imageFaceData.face_90.facial_areaimageFaceData.face_91.facial_areaimageFaceData.face_92.facial_areaimageFaceData.face_93.facial_areaimageFaceData.face_94.facial_areaimageFaceData.face_95.facial_areaimageFaceData.face_96.facial_areaimageFaceData.face_97.facial_areaimageFaceData.face_98.facial_areaimageFaceData.face_99.facial_area
02IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'assertLanguage': 'Marathi', 'audioFileName':...MarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...NaNAutomatedAurangabadfemale...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
13IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'assertLanguage': 'Marathi', 'audioFileName':...MarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...NaNAutomatedAurangabadfemale...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
24IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'assertLanguage': 'Marathi', 'audioFileName':...MarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...NaNAutomatedAurangabadfemale...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
35IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'assertLanguage': 'Marathi', 'audioFileName':...MarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...NaNAutomatedAurangabadfemale...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
46IISc_VaaniProject_S_Maharashtra_Aurangabad_952...https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...{'assertLanguage': 'Marathi', 'audioFileName':...MarathiAudios/Aurangabad/IISc_VaaniProject_S_Maharash...NaNAutomated&ManualAurangabadfemale...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
..................................................................
95849279696433IISc_VaaniProject_M_KA_Chamrajn_42017276_16081...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'assertLanguage': 'Kannada', 'audioFileName':...KannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...NaNAutomatedChamarajanagarfemale...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
95849289696434IISc_VaaniProject_M_KA_Chamrajn_42017276_16053...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'assertLanguage': 'Kannada', 'audioFileName':...KannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...NaNAutomated&ManualChamarajanagarfemale...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
95849299696435IISc_VaaniProject_M_KA_Chamrajn_42017276_12370...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'assertLanguage': 'Kannada', 'audioFileName':...KannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...NaNAutomatedChamarajanagarfemale...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
95849309696436IISc_VaaniProject_M_KA_Chamrajn_42017276_09272...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'assertLanguage': 'Kannada', 'audioFileName':...KannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...NaNAutomatedChamarajanagarfemale...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
95849319696437IISc_VaaniProject_M_KA_Chamrajn_42017276_15323...https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...{'assertLanguage': 'Kannada', 'audioFileName':...KannadaAudios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...NaNAutomatedChamarajanagarfemale...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
\n", "

9584932 rows × 251 columns

\n", "
" ], "text/plain": [ " id file_name \\\n", "0 2 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n", "1 3 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n", "2 4 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n", "3 5 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n", "4 6 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n", "... ... ... \n", "9584927 9696433 IISc_VaaniProject_M_KA_Chamrajn_42017276_16081... \n", "9584928 9696434 IISc_VaaniProject_M_KA_Chamrajn_42017276_16053... \n", "9584929 9696435 IISc_VaaniProject_M_KA_Chamrajn_42017276_12370... \n", "9584930 9696436 IISc_VaaniProject_M_KA_Chamrajn_42017276_09272... \n", "9584931 9696437 IISc_VaaniProject_M_KA_Chamrajn_42017276_15323... \n", "\n", " file_url \\\n", "0 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n", "1 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n", "2 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n", "3 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n", "4 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n", "... ... \n", "9584927 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n", "9584928 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n", "9584929 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n", "9584930 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n", "9584931 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n", "\n", " metadata assertLanguage \\\n", "0 {'assertLanguage': 'Marathi', 'audioFileName':... Marathi \n", "1 {'assertLanguage': 'Marathi', 'audioFileName':... Marathi \n", "2 {'assertLanguage': 'Marathi', 'audioFileName':... Marathi \n", "3 {'assertLanguage': 'Marathi', 'audioFileName':... Marathi \n", "4 {'assertLanguage': 'Marathi', 'audioFileName':... Marathi \n", "... ... ... \n", "9584927 {'assertLanguage': 'Kannada', 'audioFileName':... Kannada \n", "9584928 {'assertLanguage': 'Kannada', 'audioFileName':... Kannada \n", "9584929 {'assertLanguage': 'Kannada', 'audioFileName':... 
Kannada \n", "9584930 {'assertLanguage': 'Kannada', 'audioFileName':... Kannada \n", "9584931 {'assertLanguage': 'Kannada', 'audioFileName':... Kannada \n", "\n", " audioFileName audioManualQC \\\n", "0 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... NaN \n", "1 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... NaN \n", "2 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... NaN \n", "3 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... NaN \n", "4 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... NaN \n", "... ... ... \n", "9584927 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... NaN \n", "9584928 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... NaN \n", "9584929 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... NaN \n", "9584930 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... NaN \n", "9584931 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... NaN \n", "\n", " audioQualityCheck district gender ... \\\n", "0 Automated Aurangabad female ... \n", "1 Automated Aurangabad female ... \n", "2 Automated Aurangabad female ... \n", "3 Automated Aurangabad female ... \n", "4 Automated&Manual Aurangabad female ... \n", "... ... ... ... ... \n", "9584927 Automated Chamarajanagar female ... \n", "9584928 Automated&Manual Chamarajanagar female ... \n", "9584929 Automated Chamarajanagar female ... \n", "9584930 Automated Chamarajanagar female ... \n", "9584931 Automated Chamarajanagar female ... \n", "\n", " imageFaceData.face_90.facial_area imageFaceData.face_91.facial_area \\\n", "0 None None \n", "1 None None \n", "2 None None \n", "3 None None \n", "4 None None \n", "... ... ... \n", "9584927 None None \n", "9584928 None None \n", "9584929 None None \n", "9584930 None None \n", "9584931 None None \n", "\n", " imageFaceData.face_92.facial_area imageFaceData.face_93.facial_area \\\n", "0 None None \n", "1 None None \n", "2 None None \n", "3 None None \n", "4 None None \n", "... ... ... 
\n", "9584927 None None \n", "9584928 None None \n", "9584929 None None \n", "9584930 None None \n", "9584931 None None \n", "\n", " imageFaceData.face_94.facial_area imageFaceData.face_95.facial_area \\\n", "0 None None \n", "1 None None \n", "2 None None \n", "3 None None \n", "4 None None \n", "... ... ... \n", "9584927 None None \n", "9584928 None None \n", "9584929 None None \n", "9584930 None None \n", "9584931 None None \n", "\n", " imageFaceData.face_96.facial_area imageFaceData.face_97.facial_area \\\n", "0 None None \n", "1 None None \n", "2 None None \n", "3 None None \n", "4 None None \n", "... ... ... \n", "9584927 None None \n", "9584928 None None \n", "9584929 None None \n", "9584930 None None \n", "9584931 None None \n", "\n", " imageFaceData.face_98.facial_area imageFaceData.face_99.facial_area \n", "0 None None \n", "1 None None \n", "2 None None \n", "3 None None \n", "4 None None \n", "... ... ... \n", "9584927 None None \n", "9584928 None None \n", "9584929 None None \n", "9584930 None None \n", "9584931 None None \n", "\n", "[9584932 rows x 251 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fullJSON = pd.read_parquet(\"Vaani-Images-Audio-JSON.parquet\")\n", "fullJSON" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/9584932 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfile_namefile_urlmetadataassertLanguageaudioFileNameaudioManualQCaudioQualityCheckdistrictgender...imageFaceData.face_90.facial_areaimageFaceData.face_91.facial_areaimageFaceData.face_92.facial_areaimageFaceData.face_93.facial_areaimageFaceData.face_94.facial_areaimageFaceData.face_95.facial_areaimageFaceData.face_96.facial_areaimageFaceData.face_97.facial_areaimageFaceData.face_98.facial_areaimageFaceData.face_99.facial_area
\n", "

0 rows × 251 columns

\n", "" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, file_name, file_url, metadata, assertLanguage, audioFileName, audioManualQC, audioQualityCheck, district, gender, imageFaceData, imageFileName, languagesSpoken, pincode, speakerImageHash, state, stay(years), transcript, transcriptQualityCheck, imageFaceData.face_1.facial_area, imageFaceData.face_10, imageFaceData.face_100, imageFaceData.face_101, imageFaceData.face_102, imageFaceData.face_103, imageFaceData.face_104, imageFaceData.face_105, imageFaceData.face_106, imageFaceData.face_107, imageFaceData.face_108, imageFaceData.face_109, imageFaceData.face_11, imageFaceData.face_110, imageFaceData.face_111, imageFaceData.face_12, imageFaceData.face_13, imageFaceData.face_14, imageFaceData.face_15, imageFaceData.face_16, imageFaceData.face_17, imageFaceData.face_18, imageFaceData.face_19, imageFaceData.face_2.facial_area, imageFaceData.face_20, imageFaceData.face_21, imageFaceData.face_22, imageFaceData.face_23, imageFaceData.face_24, imageFaceData.face_25, imageFaceData.face_26, imageFaceData.face_27, imageFaceData.face_28, imageFaceData.face_29, imageFaceData.face_3.facial_area, imageFaceData.face_30, imageFaceData.face_31, imageFaceData.face_32, imageFaceData.face_33, imageFaceData.face_34, imageFaceData.face_35, imageFaceData.face_36, imageFaceData.face_37, imageFaceData.face_38, imageFaceData.face_39, imageFaceData.face_4.facial_area, imageFaceData.face_40, imageFaceData.face_41, imageFaceData.face_42, imageFaceData.face_43, imageFaceData.face_44, imageFaceData.face_45, imageFaceData.face_46, imageFaceData.face_47, imageFaceData.face_48, imageFaceData.face_49, imageFaceData.face_5.facial_area, imageFaceData.face_50, imageFaceData.face_51, imageFaceData.face_52, imageFaceData.face_53, imageFaceData.face_54, imageFaceData.face_55, imageFaceData.face_56, imageFaceData.face_57, imageFaceData.face_58, imageFaceData.face_59, imageFaceData.face_6.facial_area, imageFaceData.face_60, imageFaceData.face_61, 
imageFaceData.face_62, imageFaceData.face_63, imageFaceData.face_64, imageFaceData.face_65, imageFaceData.face_66, imageFaceData.face_67, imageFaceData.face_68, imageFaceData.face_69, imageFaceData.face_7.facial_area, imageFaceData.face_70, imageFaceData.face_71, ...]\n", "Index: []\n", "\n", "[0 rows x 251 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fullJSON.loc[fullJSON.state == 'Delhi']" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([np.int64(2),\n", " 'IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav',\n", " 'https://vaani.iisc.ac.in/Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav',\n", " {'assertLanguage': 'Marathi', 'audioFileName': 'Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav', 'audioManualQC': None, 'audioQualityCheck': 'Automated', 'district': 'Aurangabad', 'gender': 'female', 'imageFaceData': None, 'imageFileName': 'Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00559.jpg', 'languagesSpoken': array(['Marathi'], dtype=object), 'pincode': '431105', 'speakerImageHash': '(iq~v-nq +lTC]QXDCSnJ2~23=+|Nq~nn(', 'state': 'Maharashtra', 'stay(years)': 'Aurangabad(23)', 'transcript': None, 'transcriptQualityCheck': None},\n", " 'Marathi',\n", " 'Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav',\n", " np.float64(nan), 'Automated', 'Aurangabad', 'female',\n", " np.float64(nan),\n", " 'Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00559.jpg',\n", " array(['Marathi'], dtype=object), '431105',\n", " '(iq~v-nq +lTC]QXDCSnJ2~23=+|Nq~nn(', 'Maharashtra',\n", " 'Aurangabad(23)', None, None, None, np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", 
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), None, np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " None, np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), None,\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), None, np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), None, np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " None, np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), None,\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), None, np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), None, None, None, None, None, None, None, None,\n", " None, None, None, np.float64(nan), np.float64(nan),\n", " np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n", " np.float64(nan), 
np.float64(nan), None, None, None, None, None,\n", " None, None, None, None, None, None, None, None, None, None, None,\n", " None, None, None, None, None, None, None, None, None, None, None,\n", " None, None, None, None, None, None, None, None, None, None, None,\n", " None, None, None, None, None, None, None, None, None, None, None,\n", " None, None, None, None, None, None, None, None, None, None, None,\n", " None, None, None, None, None, None, None, None, None, None, None,\n", " None, None, None, None, None, None, None, None, None, None, None,\n", " None, None, None, None, None, None, None, None, None, None, None,\n", " None, None, None, None, None, None, None, None, None], dtype=object)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fullJSON.iloc[0,:].values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statedistrictgenderassertLanguagefile_nameimageFileName
0AndhraPradeshAnantpurfemaleBengaliIISc_VaaniProject_S_AP_Anantpur_100778_1106180...Images/IISc_VaaniProject_Anantpur-SPECIFIC_009...
1AndhraPradeshAnantpurfemaleBengaliIISc_VaaniProject_S_AP_Anantpur_100778_1106122...Images/IISc_VaaniProject_Anantpur-SPECIFIC_001...
2AndhraPradeshAnantpurfemaleBengaliIISc_VaaniProject_S_AP_Anantpur_100778_1106132...Images/IISc_VaaniProject_Anantpur-SPECIFIC_007...
3AndhraPradeshAnantpurfemaleBengaliIISc_VaaniProject_S_AP_Anantpur_100778_1106123...Images/IISc_VaaniProject_Anantpur-SPECIFIC_010...
4AndhraPradeshAnantpurfemaleBengaliIISc_VaaniProject_S_AP_Anantpur_100778_1106123...Images/IISc_VaaniProject_Anantpur-SPECIFIC_005...
.....................
9584927WestBengalPuruliamaleSantaliIISc_VaaniProject_M_WB_Purulia_Guru45176_03235...Images/IISc_VaaniProject_Purulia-SPECIFIC_0165...
9584928WestBengalPuruliamaleSantaliIISc_VaaniProject_M_WB_Purulia_Guru45176_03313...Images/IISc_VaaniProject_GENERIC_0839.jpg
9584929WestBengalPuruliamaleSantaliIISc_VaaniProject_M_WB_Purulia_Guru45176_03292...Images/IISc_VaaniProject_GENERIC_0022.jpg
9584930WestBengalPuruliamaleSantaliIISc_VaaniProject_M_WB_Purulia_Guru45176_03050...Images/IISc_VaaniProject_Purulia-SPECIFIC_0162...
9584931WestBengalPuruliamaleSantaliIISc_VaaniProject_M_WB_Purulia_Kira23456_04200...Images/IISc_VaaniProject_Purulia-SPECIFIC_0069...
\n", "

9584932 rows x 6 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# NOTE(review): no assertLanguage filter is applied here, so this frame keeps every language despite the 'Hindi' in its name - confirm intent.\n", "ImageAudioHindi_df = fullJSON[['state', 'district', 'gender', 'assertLanguage', 'file_name', 'imageFileName']]\n", "ImageAudioHindi_df = ImageAudioHindi_df.sort_values(by=['state', 'district', 'assertLanguage'], \n", " ascending=[True, True, True])\n", "ImageAudioHindi_df = ImageAudioHindi_df.reset_index(drop=True)\n", "# ImageAudioHindi_df.to_csv(\"Image-Audio-Hindi.csv\", index=False)\n", "ImageAudioHindi_df" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Image - Audio(Hindi) Dataloaders for CSIP" ] },
{ "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "import os\n", "# import pandas as pd\n", "import fireducks.pandas as pd\n", "from tqdm import tqdm, trange\n", "\n", "HINDI_AUDIO_DIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi\"\n", "IMAGEDIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images\"\n", "IMAGEAUDIOCSV = r\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi3.csv\"" ] },
{ "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "def walkDIR(folder_path, include=None):\n", "    \"\"\"Recursively list the files under folder_path.\n", "\n", "    include is None (keep every file), a single suffix string such as '.wav',\n", "    or an iterable of suffixes. Prints the number of matches and returns the\n", "    matching paths as a list of os.path.join(root, file) strings.\n", "    \"\"\"\n", "    # Normalise once, outside the loops: a bare string is ONE suffix rather than\n", "    # a sequence of characters, and a one-shot iterable must not be exhausted by\n", "    # the first file it is tested against.\n", "    if include is None:\n", "        suffixes = None\n", "    elif isinstance(include, str):\n", "        suffixes = (include,)\n", "    else:\n", "        suffixes = tuple(include)\n", "\n", "    file_list = []\n", "    for root, _, files in os.walk(folder_path):\n", "        for file in files:\n", "            # str.endswith accepts a tuple and is True when any suffix matches.\n", "            if suffixes is None or file.endswith(suffixes):\n", "                file_list.append(os.path.join(root, file))\n", "    print(\"Files found:\", len(file_list))\n", "    return file_list" ] },
{ "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Files found: 38300\n" ] }, { "data": { "text/plain": [ "(128807, 38300)" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "images_files = [os.path.join(IMAGEDIR, i) for i in os.listdir(IMAGEDIR) if i.endswith(\".jpg\")]\n", "audio_files = walkDIR(HINDI_AUDIO_DIR, include=['.wav'])\n", "\n", "len(images_files), len(audio_files)"
] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg',\n", " '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_TehriGarhwal-SPECIFIC_00863.jpg',\n", " '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Budaun-SPECIFIC_00129.jpg']" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "images_files[:3]" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav',\n", " '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Ishi43015_43015203103000098276_NewDelhi-SPECIFIC_00021_9344_14805.wav',\n", " '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Radi56078_56078170942000098763_GENERIC_1166_182_3703.wav']" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audio_files[:3]" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani'" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.path.dirname(IMAGEDIR)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
audio_pathreferenceImagegenderstatedistrict
0IISc_VaaniProject_M_Delhi_NewDelhi_Zoya76543_7...Images/IISc_VaaniProject_GENERIC_0473.jpgFemaleDelhiNewDelhi
1IISc_VaaniProject_M_Delhi_NewDelhi_Kaja46663_4...Images/IISc_VaaniProject_GENERIC_1011.jpgFemaleDelhiNewDelhi
2IISc_VaaniProject_M_Delhi_NewDelhi_Kris66646_6...Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015...FemaleDelhiNewDelhi
3IISc_VaaniProject_M_Delhi_NewDelhi_Abuz00012_0...Images/IISc_VaaniProject_GENERIC_0418.jpgMaleDelhiNewDelhi
4IISc_VaaniProject_M_Delhi_NewDelhi_Adah26256_2...Images/IISc_VaaniProject_GENERIC_0851.jpgMaleDelhiNewDelhi
..................
38295IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_00594...MaleJharkhandRanchi
38296IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_00700...MaleJharkhandRanchi
38297IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_01941...MaleJharkhandRanchi
38298IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_01405...MaleJharkhandRanchi
38299IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_01327...FemaleJharkhandRanchi
\n", "

38300 rows x 5 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "ImageAudioHindi_df = pd.read_csv(IMAGEAUDIOCSV)\n", "ImageAudioHindi_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Audio Image Mapping" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images',\n", " '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi')" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "IMAGEDIR, HINDI_AUDIO_DIR" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|\u001b[33m████████████████████████████████████████████████████████\u001b[0m| 38300/38300 [00:54<00:00, 704.27it/s]\u001b[0m\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
image_pathaudio_path
0/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
1/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
2/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
3/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
4/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
.........
22327/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22328/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22329/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22330/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22331/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
\n", "

22332 rows × 2 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Resolve each CSV row to absolute image/audio paths and keep only the pairs\n", "# whose two files both exist on disk.\n", "image_root = os.path.dirname(IMAGEDIR)  # loop-invariant; joined with row.referenceImage\n", "available_img_audios = {\"image_path\":[], \"audio_path\":[]}\n", "\n", "# itertuples() yields lightweight namedtuples instead of building a Series per row.\n", "for row in tqdm(ImageAudioHindi_df.itertuples(index=False), ncols=100, total=ImageAudioHindi_df.shape[0], colour='YELLOW'):\n", "    image_path = os.path.join(image_root, row.referenceImage)\n", "    audio_path = os.path.join(HINDI_AUDIO_DIR, f\"{row.state}_{row.district}\", row.audio_path)\n", "\n", "    # Short-circuit: skip the second stat call when the audio file is already missing.\n", "    if os.path.isfile(audio_path) and os.path.isfile(image_path):\n", "        available_img_audios['image_path'].append(image_path)\n", "        available_img_audios['audio_path'].append(audio_path)\n", "\n", "available_img_audios_df = pd.DataFrame(available_img_audios)\n", "available_img_audios_df" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# available_img_audios_df.to_csv(\"available_img_audios.csv\", index=False)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((15632, 2), (6700, 2))" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# from sklearn.model_selection import train_test_split\n", "\n", "# train_df, test_df = train_test_split(available_img_audios_df, test_size=0.3, shuffle=True, random_state=42)\n", "# train_df = train_df.reset_index(drop=True)\n", "# test_df = test_df.reset_index(drop=True)\n", "# train_df.shape, test_df.shape" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# train_df.to_csv(\"available_img_audios_TRAIN.csv\", index=False)\n", "# test_df.to_csv(\"available_img_audios_TEST.csv\", index=False)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Audio Image Mapping for New Downloaded Images" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "# import pandas as pd\n", "import fireducks.pandas as pd\n", "from
def walkDIR(folder_path, include=None):
    """
    Recursively collect the paths of all files below a folder.

    Parameters:
        folder_path (str): Root directory passed to ``os.walk``.
        include (str | iterable of str | None): File-name suffix(es) to keep,
            e.g. ``['.jpg', '.png']``. ``None`` keeps every file. A single
            string is treated as ONE suffix.
    Returns:
        list: ``os.path.join(root, file)`` for every kept file, in the order
        ``os.walk`` yields them.

    The number of files found is printed before returning.
    """
    if include is None:
        suffixes = None
    elif isinstance(include, str):
        # A bare string would otherwise be iterated character by character,
        # so include=".wav" would match any file ending in ".", "w", "a" or "v".
        suffixes = (include,)
    else:
        # str.endswith accepts a tuple and tests all suffixes in one call.
        suffixes = tuple(include)

    file_list = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if suffixes is None or file.endswith(suffixes):
                file_list.append(os.path.join(root, file))
    print("Files found:", len(file_list))
    return file_list
"['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav',\n", " '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Ishi43015_43015203103000098276_NewDelhi-SPECIFIC_00021_9344_14805.wav',\n", " '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Radi56078_56078170942000098763_GENERIC_1166_182_3703.wav']" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "audio_files[:3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani'" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "os.path.dirname(IMAGEDIR)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images',\n", " '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi')" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "IMAGEDIR, HINDI_AUDIO_DIR" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|\u001b[33m████████████████████████████████████████████████████████\u001b[0m| 38300/38300 [00:54<00:00, 704.27it/s]\u001b[0m\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
image_pathaudio_path
0/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
1/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
2/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
3/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
4/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
.........
22327/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22328/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22329/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22330/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22331/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
\n", "

22332 rows × 2 columns

\n", "
# --- Audio/image mapping: keep only rows whose image AND audio exist on disk ---
# Uses ImageAudioHindi_df, IMAGEDIR and HINDI_AUDIO_DIR defined in earlier cells.
available_img_audios = {"image_path": [], "audio_path": []}

# `referenceImage` is joined onto the PARENT of IMAGEDIR (the values appear to
# carry their own leading "Images/" component -- confirm against the CSV).
# Normalising first keeps the lookup correct if IMAGEDIR has a trailing slash.
images_parent_dir = os.path.dirname(os.path.normpath(IMAGEDIR))

# itertuples() yields lightweight namedtuples instead of one Series per row.
for row in tqdm(ImageAudioHindi_df.itertuples(index=False), ncols=100,
                total=ImageAudioHindi_df.shape[0], colour='YELLOW'):
    image_path = os.path.join(images_parent_dir, row.referenceImage)
    audio_path = os.path.join(HINDI_AUDIO_DIR, f"{row.state}_{row.district}", row.audio_path)

    # `and` short-circuits: a missing audio file costs one stat call, not two.
    if os.path.isfile(audio_path) and os.path.isfile(image_path):
        available_img_audios['image_path'].append(image_path)
        available_img_audios['audio_path'].append(audio_path)

available_img_audios_df = pd.DataFrame(available_img_audios)
available_img_audios_df


# --- Audio Image Mapping for All Images: flatten per-folder images -------------
import os, shutil


def copy_files_from_folders(name, source_folders, destination_folder):
    r'''
    Copies files from multiple source folders into one destination folder,
    tagging each copied file name with the type of folder it came from.
    Parameters:
        name (str): Sub-folder created inside ``destination_folder``.
        source_folders (list): Paths of the source folders.
        destination_folder (str): Parent of the folder that receives the files.
    Returns:
        None
    The tag is inserted between the file stem and its extension:
    'highligits' when the source path contains both 'story' and 'highligits',
    'story' when it contains only 'story', and 'post' otherwise
    (the spelling 'highligits' is the literal the code matches on).
    Only regular files directly inside each source folder are copied.
    Source entries that are not directories are reported and skipped.
    A file whose tagged name already exists in the destination is not
    overwritten; its destination path is collected and printed at the end,
    after the total number of files copied.
    Example:
        name = 'Folder1'
        source_folders = srcdir
        destination_folder = dstdir
        copy_files_from_folders(name, source_folders, destination_folder)
    '''
    destination_folder = os.path.join(destination_folder, name)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(destination_folder, exist_ok=True)

    total_files = 0
    duplicate_files = []

    for source_folder in source_folders:
        print(source_folder)
        if not os.path.isdir(source_folder):
            # os.listdir() would raise on a plain file or a missing path and
            # abort the whole run; report and move on instead.
            print(f"Skipping non-directory source: {source_folder}")
            continue

        # The tag depends only on the folder path, so compute it once per folder.
        if 'story' in source_folder:
            tag = 'highligits' if 'highligits' in source_folder else 'story'
        else:
            tag = 'post'

        for file_name in tqdm(os.listdir(source_folder)):
            source_file_path = os.path.join(source_folder, file_name)
            if not os.path.isfile(source_file_path):
                continue

            # splitext keeps interior dots ("a.b.jpg" -> "a.b<tag>.jpg") and
            # handles names without an extension ("README" -> "README<tag>").
            stem, extension = os.path.splitext(file_name)
            destination_file_path = os.path.join(destination_folder, f"{stem}{tag}{extension}")

            if os.path.isfile(destination_file_path):
                duplicate_files.append(destination_file_path)
            else:
                shutil.copy(source_file_path, destination_file_path)
                total_files += 1

    print(f'Total {total_files} files copies')
    for i in duplicate_files:
        print(i)


name = 'Folder1'
destination_folder = r'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/'
# Only sub-directories are valid sources; also leave out the destination
# itself so a re-run does not copy 'Folder1' into 'Folder1'.
source_folders = [
    os.path.join(IMAGEDIR, i)
    for i in os.listdir(IMAGEDIR)
    if i != name and os.path.isdir(os.path.join(IMAGEDIR, i))
]
copy_files_from_folders(name, source_folders, destination_folder)


# --- Merge the per-config image folders into one folder with rsync ------------
import subprocess
import os
from joblib import Parallel, delayed


def merge_single_folder(src, target_folder, ignore_existing=False, dry_run=False):
    """
    Merges contents of a single source folder into the target folder using rsync.

    Parameters:
        src (str): Source directory; its contents (not the directory itself)
            are synced, because a trailing slash is appended for rsync.
        target_folder (str): Directory that receives the files.
        ignore_existing (bool): Adds ``--ignore-existing`` to the rsync call.
        dry_run (bool): Adds ``--dry-run`` to the rsync call.
    Returns:
        None
    Raises:
        subprocess.CalledProcessError: if rsync exits with a non-zero status.
    """
    # isdir rather than exists: "<file>/" is not a valid rsync source and
    # would raise CalledProcessError, aborting the sibling parallel jobs.
    if not os.path.isdir(src):
        print(f"Source folder does not exist: {src}")
        return

    cmd = ["rsync", "-ah"]

    if ignore_existing:
        cmd.append("--ignore-existing")
    if dry_run:
        cmd.append("--dry-run")

    # Exactly one trailing slash => rsync copies the folder's contents.
    cmd += [f"{src.rstrip('/')}/", target_folder]

    print(f"Merging '{src}' into '{target_folder}'")
    subprocess.run(cmd, check=True)
    # The target may not exist (e.g. a dry run called directly); report 0
    # instead of raising FileNotFoundError.
    files_in_target = len(os.listdir(target_folder)) if os.path.isdir(target_folder) else 0
    print(f"{files_in_target} files currently in target")
    print("-" * 100)


def merge_folders_with_rsync_parallel(source_folders, target_folder, ignore_existing=False, dry_run=False, n_jobs=-1):
    """
    Parallel merge of multiple source folders into a target folder using rsync and joblib.

    Parameters:
        source_folders (iterable of str): Folders handed one by one to
            ``merge_single_folder``.
        target_folder (str): Destination directory; created if missing.
        ignore_existing (bool): Forwarded to ``merge_single_folder``.
        dry_run (bool): Forwarded to ``merge_single_folder``.
        n_jobs (int): Passed to ``joblib.Parallel``.
    Returns:
        None
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(target_folder, exist_ok=True)

    Parallel(n_jobs=n_jobs, backend="loky")(
        delayed(merge_single_folder)(src, target_folder, ignore_existing, dry_run)
        for src in source_folders
    )


IMAGEDIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/From-Images-Config/"
source_dirs = [os.path.join(IMAGEDIR, i) for i in os.listdir(IMAGEDIR)]
target_dir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3'

merge_folders_with_rsync_parallel(source_dirs, target_dir, ignore_existing=False, dry_run=False)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
audio_pathreferenceImagegenderstatedistrict
0IISc_VaaniProject_M_Delhi_NewDelhi_Zoya76543_7...Images/IISc_VaaniProject_GENERIC_0473.jpgFemaleDelhiNewDelhi
1IISc_VaaniProject_M_Delhi_NewDelhi_Kaja46663_4...Images/IISc_VaaniProject_GENERIC_1011.jpgFemaleDelhiNewDelhi
2IISc_VaaniProject_M_Delhi_NewDelhi_Kris66646_6...Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015...FemaleDelhiNewDelhi
3IISc_VaaniProject_M_Delhi_NewDelhi_Abuz00012_0...Images/IISc_VaaniProject_GENERIC_0418.jpgMaleDelhiNewDelhi
4IISc_VaaniProject_M_Delhi_NewDelhi_Adah26256_2...Images/IISc_VaaniProject_GENERIC_0851.jpgMaleDelhiNewDelhi
..................
38295IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_00594...MaleJharkhandRanchi
38296IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_00700...MaleJharkhandRanchi
38297IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_01941...MaleJharkhandRanchi
38298IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_01405...MaleJharkhandRanchi
38299IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_01327...FemaleJharkhandRanchi
\n", "

38300 rows x 5 columns

\n", "
def walkDIR(folder_path, include=None):
    """
    Recursively collect the paths of all files below a folder.

    Parameters:
        folder_path (str): Root directory passed to ``os.walk``.
        include (str | iterable of str | None): File-name suffix(es) to keep,
            e.g. ``['.jpg', '.png']``. ``None`` keeps every file. A single
            string is treated as ONE suffix.
    Returns:
        list: ``os.path.join(root, file)`` for every kept file, in the order
        ``os.walk`` yields them.

    The number of files found is printed before returning.
    """
    if include is None:
        suffixes = None
    elif isinstance(include, str):
        # A bare string would otherwise be iterated character by character,
        # so include=".wav" would match any file ending in ".", "w", "a" or "v".
        suffixes = (include,)
    else:
        # str.endswith accepts a tuple and tests all suffixes in one call.
        suffixes = tuple(include)

    file_list = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if suffixes is None or file.endswith(suffixes):
                file_list.append(os.path.join(root, file))
    print("Files found:", len(file_list))
    return file_list
'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Ishi43015_43015203103000098276_NewDelhi-SPECIFIC_00021_9344_14805.wav',\n", " '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Radi56078_56078170942000098763_GENERIC_1166_182_3703.wav']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audio_files[:3]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.path.dirname(IMAGEDIR)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.path.dirname(os.path.dirname(IMAGEDIR))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/',\n", " '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi')" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "IMAGEDIR, HINDI_AUDIO_DIR" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|\u001b[33m████████████████████████████████████████████████████████\u001b[0m| 38300/38300 [00:46<00:00, 829.07it/s]\u001b[0m\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
image_pathaudio_path
0/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
1/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
2/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
3/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
4/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
.........
38295/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
38296/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
38297/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
38298/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
38299/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan.../scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
\n", "

38300 rows × 2 columns

\n", "
# Build the table of (image, audio) pairs for which BOTH files exist on disk.
# Uses ImageAudioHindi_df, IMAGEDIR and HINDI_AUDIO_DIR defined in earlier cells.
available_img_audios = {"image_path": [], "audio_path": []}

# itertuples() yields lightweight namedtuples instead of one Series per row.
for row in tqdm(ImageAudioHindi_df.itertuples(index=False), ncols=100,
                total=ImageAudioHindi_df.shape[0], colour='YELLOW'):
    # Only the file name of `referenceImage` is used, looked up directly in
    # IMAGEDIR. Joining onto IMAGEDIR itself (instead of dirname(IMAGEDIR))
    # gives the same path while IMAGEDIR ends in "/", and stays correct if
    # the trailing slash is dropped -- dirname() would then point one level
    # up and every lookup would silently miss.
    image_path = os.path.join(IMAGEDIR, os.path.basename(row.referenceImage))
    # Audio files are looked up under "<state>_<district>/" inside HINDI_AUDIO_DIR.
    audio_path = os.path.join(HINDI_AUDIO_DIR, f"{row.state}_{row.district}", row.audio_path)

    # `and` short-circuits: a missing audio file costs one stat call, not two.
    if os.path.isfile(audio_path) and os.path.isfile(image_path):
        available_img_audios['image_path'].append(image_path)
        available_img_audios['audio_path'].append(audio_path)

available_img_audios_df = pd.DataFrame(available_img_audios)
available_img_audios_df
image_pathaudio_path
\n", "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# mapping_counter = 0\n", "# available_img_audios = {\"image_path\":[], \"audio_path\":[]}\n", "\n", "# for i, row in tqdm(ImageAudioHindi_df.iterrows(), ncols=100, total=ImageAudioHindi_df.shape[0], colour='YELLOW'):\n", "# image_path = os.path.join(os.path.dirname(IMAGEDIR), row.referenceImage)\n", "# audio_path = os.path.join(HINDI_AUDIO_DIR, f\"{row.state}_{row.district}\", row.audio_path)\n", " \n", "# if all([os.path.isfile(audio_path), os.path.isfile(image_path)]):\n", "# available_img_audios['image_path'].append(image_path)\n", "# available_img_audios['audio_path'].append(audio_path)\n", "\n", "# available_img_audios_df = pd.DataFrame(available_img_audios)\n", "# available_img_audios_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# available_img_audios_df.to_csv(\"available_img_audios.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((26810, 2), (11490, 2))" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# from sklearn.model_selection import train_test_split\n", "\n", "# train_df, test_df = train_test_split(available_img_audios_df, test_size=0.3, shuffle=True, random_state=42)\n", "# train_df = train_df.reset_index(drop=True)\n", "# test_df = test_df.reset_index(drop=True)\n", "# train_df.shape, test_df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# train_df.to_csv(\"available_img_audios_TRAIN2.csv\", index=False)\n", "# test_df.to_csv(\"available_img_audios_TEST2.csv\", index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Audio Image Mapping for All Images Polars MetaData" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "# import pandas as pd\n", "import 
fireducks.pandas as pd\n", "import polars as pl\n", "from tqdm import tqdm, trange\n", "\n", "HINDI_AUDIO_DIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi\"\n", "IMAGEDIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/\"\n", "IMAGEAUDIOCSV = r\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi3.csv\"\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
audio_pathreferenceImagegenderstatedistrict
0IISc_VaaniProject_M_Delhi_NewDelhi_Zoya76543_7...Images/IISc_VaaniProject_GENERIC_0473.jpgFemaleDelhiNewDelhi
1IISc_VaaniProject_M_Delhi_NewDelhi_Kaja46663_4...Images/IISc_VaaniProject_GENERIC_1011.jpgFemaleDelhiNewDelhi
2IISc_VaaniProject_M_Delhi_NewDelhi_Kris66646_6...Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015...FemaleDelhiNewDelhi
3IISc_VaaniProject_M_Delhi_NewDelhi_Abuz00012_0...Images/IISc_VaaniProject_GENERIC_0418.jpgMaleDelhiNewDelhi
4IISc_VaaniProject_M_Delhi_NewDelhi_Adah26256_2...Images/IISc_VaaniProject_GENERIC_0851.jpgMaleDelhiNewDelhi
..................
38295IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_00594...MaleJharkhandRanchi
38296IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_00700...MaleJharkhandRanchi
38297IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_01941...MaleJharkhandRanchi
38298IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_01405...MaleJharkhandRanchi
38299IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...Images/IISc_VaaniProject_Ranchi-SPECIFIC_01327...FemaleJharkhandRanchi
\n", "

38300 rows x 5 columns

\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "ImageAudioHindi_df = pd.read_csv(IMAGEAUDIOCSV)\n", "ImageAudioHindi_df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.2" } }, "nbformat": 4, "nbformat_minor": 2 }