{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from pyspark.sql import SparkSession\n",
"import pandas as pd\n",
"import polars as pl\n",
"from tqdm.auto import tqdm, trange\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"SCRATCH = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani\"\n",
"DATADIR = r\"/home/IITB/ai-at-ieor/23m1521/datasets/Vaani\"\n",
"JSON_PATH = os.path.join(DATADIR, \"Vaani_IIsc_Artpark_Full_Data.json\")\n",
"# IMAGES_PATH = os.path.join(SCRATCH, \"Images\")\n",
"IMAGES_PARQUETS = os.path.join(SCRATCH, \"images_parquets\")\n",
"AUDIO_URLS = \"audio_urls.txt\"\n",
"IMAGES_URLS = \"images_urls.txt\"\n",
"IMAGE_ROOT_URL = 'https://vaani.iisc.ac.in/'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2fd393fddd534c30ae2674438039ec69",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/9584932 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data_dict = {'image_name': [], 'state': [], 'district': [], 'gender': [], 'audio_language': [], 'audio_name': []}\n",
"\n",
"with open(JSON_PATH, 'r') as json_file:\n",
" for i, line in tqdm(enumerate(json_file), total=9584932):\n",
" # if i == 15: break\n",
" line = json.loads(line.strip()[1:-1] if i == 0 else line.strip()[:-1])\n",
"\n",
" image_name = line['metadata']['imageFileName']\n",
" image_path = os.path.join(SCRATCH, image_name)\n",
" image_state = line['metadata']['state']\n",
" image_district = line['metadata']['district']\n",
" image_gender = line['metadata']['gender']\n",
" audio_language = line['metadata']['languagesSpoken'][0]\n",
" audio_name = line['metadata']['audioFileName']\n",
" \n",
" # print(image_name)\n",
" # print(line)\n",
"\n",
" # if os.path.isfile(image_path):\n",
" data_dict['image_name'].append(image_name)\n",
" data_dict['state'].append(image_state)\n",
" data_dict['district'].append(image_district)\n",
" data_dict['gender'].append(image_gender)\n",
" data_dict['audio_language'].append(audio_language)\n",
" data_dict['audio_name'].append(audio_name)\n",
"\n",
" # print(\"-\" * 100)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "image_name",
"rawType": "object",
"type": "string"
},
{
"name": "state",
"rawType": "object",
"type": "string"
},
{
"name": "district",
"rawType": "object",
"type": "string"
},
{
"name": "gender",
"rawType": "object",
"type": "string"
},
{
"name": "audio_language",
"rawType": "object",
"type": "string"
},
{
"name": "audio_name",
"rawType": "object",
"type": "string"
}
],
"conversionMethod": "pd.DataFrame",
"ref": "a06018ae-01e4-43b6-a0b6-a74c24d6982a",
"rows": [
[
"0",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00559.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav"
],
[
"1",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00404.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885688_MRACH_323525_10676_14796.wav"
],
[
"2",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00365.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885773_MRACS_92011_0_13216.wav"
],
[
"3",
"Images/IISc_VaaniProject_GENERIC_0073.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885423_MRABC_323380_11266_20399.wav"
],
[
"4",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01252.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10886492_MRACH_80994_515_9950.wav"
],
[
"5",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00369.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885712_MRACLT_323569_13981_21501.wav"
],
[
"6",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00404.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885688_MRACH_323525_0_5754.wav"
],
[
"7",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00137.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885641_MRACLT_323571_12610_21310.wav"
],
[
"8",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01082.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885477_MRACH_323524_12546_20926.wav"
],
[
"9",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01272.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885675_MRACH_80188_9499_19963.wav"
],
[
"10",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01244.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885904_MRACLT_323572_818_11691.wav"
],
[
"11",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00137.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885641_MRACLT_323571_498_12290.wav"
],
[
"12",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01082.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885477_MRACH_323524_1655_12345.wav"
],
[
"13",
"Images/IISc_VaaniProject_GENERIC_0073.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885423_MRABC_323380_200_11206.wav"
],
[
"14",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00034.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10886672_MRADM_236186_498_9130.wav"
],
[
"15",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00404.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885688_MRACH_323525_6190_10263.wav"
],
[
"16",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00365.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885773_MRACS_92011_13230_20402.wav"
],
[
"17",
"Images/IISc_VaaniProject_GENERIC_0179.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10881852_KTCMMAO_300177_498_15241.wav"
],
[
"18",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00034.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10886672_MRADM_236186_9170_20846.wav"
],
[
"19",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00997.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885503_MRACH_323530_0_4589.wav"
],
[
"20",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00559.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_10473_19952.wav"
],
[
"21",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00997.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885503_MRACH_323530_4816_19125.wav"
],
[
"22",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01244.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885904_MRACLT_323572_12033_19988.wav"
],
[
"23",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01252.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10886492_MRACH_80994_9950_20461.wav"
],
[
"24",
"Images/IISc_VaaniProject_GENERIC_0179.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10881852_KTCMMAO_300177_15841_20461.wav"
],
[
"25",
"Images/IISc_VaaniProject_GENERIC_1041.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11157562_UPVNFR_142622_13_10446.wav"
],
[
"26",
"Images/IISc_VaaniProject_GENERIC_1037.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160952_UPVNTA_123296_0_12723.wav"
],
[
"27",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00369.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885712_MRACLT_323569_498_13741.wav"
],
[
"28",
"Images/IISc_VaaniProject_GENERIC_0980.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160070_UPVNHT_161133_2_11717.wav"
],
[
"29",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00404.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885688_MRACH_323525_15090_20194.wav"
],
[
"30",
"Images/IISc_VaaniProject_GENERIC_0857.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11157614_TGNLFLM_14947_11483_17575.wav"
],
[
"31",
"Images/IISc_VaaniProject_GENERIC_0784.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11161711_UPVNREI_136144_0_11475.wav"
],
[
"32",
"Images/IISc_VaaniProject_GENERIC_0991.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11159902_UPVNML_191445_0_10486.wav"
],
[
"33",
"Images/IISc_VaaniProject_GENERIC_1037.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160952_UPVNTA_123296_12723_14915.wav"
],
[
"34",
"Images/IISc_VaaniProject_GENERIC_1004.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11159231_UPVNHC_147120_0_9681.wav"
],
[
"35",
"Images/IISc_VaaniProject_GENERIC_0798.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11158013_UPVNFN_199962_0_12217.wav"
],
[
"36",
"Images/IISc_VaaniProject_GENERIC_1061.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11159169_TGNLSL_79868_2_11538.wav"
],
[
"37",
"Images/IISc_VaaniProject_GENERIC_0902.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11158905_UPVNIM_17694_0_9213.wav"
],
[
"38",
"Images/IISc_VaaniProject_GENERIC_1028.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11158685_UPVNEX_152314_0_11230.wav"
],
[
"39",
"Images/IISc_VaaniProject_GENERIC_0850.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11157526_UPVNHO_17386_0_10631.wav"
],
[
"40",
"Images/IISc_VaaniProject_GENERIC_0723.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160293_UPVNLK_136180_0_10991.wav"
],
[
"41",
"Images/IISc_VaaniProject_GENERIC_0857.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11157614_TGNLFLM_14947_565_11483.wav"
],
[
"42",
"Images/IISc_VaaniProject_GENERIC_1033.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160176_UPVNGOB_3297_107_14754.wav"
],
[
"43",
"Images/IISc_VaaniProject_GENERIC_0824.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160850_UPVNRI_11356_0_11078.wav"
],
[
"44",
"Images/IISc_VaaniProject_GENERIC_0830.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11158620_TGNLST_11812_0_9545.wav"
],
[
"45",
"Images/IISc_VaaniProject_GENERIC_0923.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160686_UPVNMO_145096_0_12159.wav"
],
[
"46",
"Images/IISc_VaaniProject_GENERIC_0797.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160624_UPVNSU_55591_2_10857.wav"
],
[
"47",
"Images/IISc_VaaniProject_Solapur-SPECIFIC_00231.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_72631_9964768_MRSREI_271382_0_12602.wav"
],
[
"48",
"Images/IISc_VaaniProject_GENERIC_0991.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11159902_UPVNML_191445_10486_13559.wav"
],
[
"49",
"Images/IISc_VaaniProject_Solapur-SPECIFIC_01764.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_72631_9962743_MRSOTHER_274077_5208_16951.wav"
]
],
"shape": {
"columns": 6,
"rows": 9584932
}
},
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" image_name | \n",
" state | \n",
" district | \n",
" gender | \n",
" audio_language | \n",
" audio_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
"
\n",
" \n",
" 1 | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
"
\n",
" \n",
" 2 | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
"
\n",
" \n",
" 3 | \n",
" Images/IISc_VaaniProject_GENERIC_0073.jpg | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
"
\n",
" \n",
" 4 | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 9584927 | \n",
" Images/IISc_VaaniProject_GENERIC_0554.jpg | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
"
\n",
" \n",
" 9584928 | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
"
\n",
" \n",
" 9584929 | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
"
\n",
" \n",
" 9584930 | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
"
\n",
" \n",
" 9584931 | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
"
\n",
" \n",
"
\n",
"
9584932 rows × 6 columns
\n",
"
"
],
"text/plain": [
" image_name state \\\n",
"0 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n",
"1 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n",
"2 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n",
"3 Images/IISc_VaaniProject_GENERIC_0073.jpg Maharashtra \n",
"4 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n",
"... ... ... \n",
"9584927 Images/IISc_VaaniProject_GENERIC_0554.jpg Karnataka \n",
"9584928 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n",
"9584929 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n",
"9584930 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n",
"9584931 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n",
"\n",
" district gender audio_language \\\n",
"0 Aurangabad female Marathi \n",
"1 Aurangabad female Marathi \n",
"2 Aurangabad female Marathi \n",
"3 Aurangabad female Marathi \n",
"4 Aurangabad female Marathi \n",
"... ... ... ... \n",
"9584927 Chamarajanagar female Kannada \n",
"9584928 Chamarajanagar female Kannada \n",
"9584929 Chamarajanagar female Kannada \n",
"9584930 Chamarajanagar female Kannada \n",
"9584931 Chamarajanagar female Kannada \n",
"\n",
" audio_name \n",
"0 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n",
"1 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n",
"2 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n",
"3 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n",
"4 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n",
"... ... \n",
"9584927 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n",
"9584928 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n",
"9584929 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n",
"9584930 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n",
"9584931 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n",
"\n",
"[9584932 rows x 6 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(data_dict)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(128807,)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.image_name.unique().shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"///////////////////////////////////////////////////////////////////////"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "81ecfc03838d46ac9870a5bc942607d7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/9584932 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data_dict = {'image_name': [], 'state': [], 'district': [], 'gender': [], 'audio_language': [], 'audio_name': []}\n",
"\n",
"with open(JSON_PATH, 'r') as json_file:\n",
" for i, line in tqdm(enumerate(json_file), total=9584932):\n",
" # if i == 15: break\n",
" line = json.loads(line.strip()[1:-1] if i == 0 else line.strip()[:-1])\n",
"\n",
" image_name = line['metadata']['imageFileName']\n",
" image_path = os.path.join(SCRATCH, image_name)\n",
" image_state = line['metadata']['state']\n",
" image_district = line['metadata']['district']\n",
" image_gender = line['metadata']['gender']\n",
" audio_language = line['metadata']['languagesSpoken'][0]\n",
" audio_name = line['metadata']['audioFileName']\n",
" \n",
" # print(image_name)\n",
" # print(line)\n",
"\n",
" if os.path.isfile(image_path):\n",
" data_dict['image_name'].append(image_name)\n",
" data_dict['state'].append(image_state)\n",
" data_dict['district'].append(image_district)\n",
" data_dict['gender'].append(image_gender)\n",
" data_dict['audio_language'].append(audio_language)\n",
" data_dict['audio_name'].append(audio_name)\n",
"\n",
" # print(\"-\" * 100)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "image_name",
"rawType": "object",
"type": "string"
},
{
"name": "state",
"rawType": "object",
"type": "string"
},
{
"name": "district",
"rawType": "object",
"type": "string"
},
{
"name": "gender",
"rawType": "object",
"type": "string"
},
{
"name": "audio_language",
"rawType": "object",
"type": "string"
},
{
"name": "audio_name",
"rawType": "object",
"type": "string"
}
],
"conversionMethod": "pd.DataFrame",
"ref": "d9fc0d76-3fec-4a97-b740-2ceda3a72ec2",
"rows": [
[
"0",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00559.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav"
],
[
"1",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00404.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885688_MRACH_323525_10676_14796.wav"
],
[
"2",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00365.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885773_MRACS_92011_0_13216.wav"
],
[
"3",
"Images/IISc_VaaniProject_GENERIC_0073.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885423_MRABC_323380_11266_20399.wav"
],
[
"4",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01252.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10886492_MRACH_80994_515_9950.wav"
],
[
"5",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00369.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885712_MRACLT_323569_13981_21501.wav"
],
[
"6",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00404.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885688_MRACH_323525_0_5754.wav"
],
[
"7",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00137.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885641_MRACLT_323571_12610_21310.wav"
],
[
"8",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01082.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885477_MRACH_323524_12546_20926.wav"
],
[
"9",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01272.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885675_MRACH_80188_9499_19963.wav"
],
[
"10",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01244.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885904_MRACLT_323572_818_11691.wav"
],
[
"11",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00137.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885641_MRACLT_323571_498_12290.wav"
],
[
"12",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01082.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885477_MRACH_323524_1655_12345.wav"
],
[
"13",
"Images/IISc_VaaniProject_GENERIC_0073.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885423_MRABC_323380_200_11206.wav"
],
[
"14",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00034.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10886672_MRADM_236186_498_9130.wav"
],
[
"15",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00404.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885688_MRACH_323525_6190_10263.wav"
],
[
"16",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00365.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885773_MRACS_92011_13230_20402.wav"
],
[
"17",
"Images/IISc_VaaniProject_GENERIC_0179.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10881852_KTCMMAO_300177_498_15241.wav"
],
[
"18",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00034.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10886672_MRADM_236186_9170_20846.wav"
],
[
"19",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00997.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885503_MRACH_323530_0_4589.wav"
],
[
"20",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00559.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_10473_19952.wav"
],
[
"21",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00997.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885503_MRACH_323530_4816_19125.wav"
],
[
"22",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01244.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885904_MRACLT_323572_12033_19988.wav"
],
[
"23",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_01252.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10886492_MRACH_80994_9950_20461.wav"
],
[
"24",
"Images/IISc_VaaniProject_GENERIC_0179.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10881852_KTCMMAO_300177_15841_20461.wav"
],
[
"25",
"Images/IISc_VaaniProject_GENERIC_1041.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11157562_UPVNFR_142622_13_10446.wav"
],
[
"26",
"Images/IISc_VaaniProject_GENERIC_1037.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160952_UPVNTA_123296_0_12723.wav"
],
[
"27",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00369.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885712_MRACLT_323569_498_13741.wav"
],
[
"28",
"Images/IISc_VaaniProject_GENERIC_0980.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160070_UPVNHT_161133_2_11717.wav"
],
[
"29",
"Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00404.jpg",
"Maharashtra",
"Aurangabad",
"female",
"Marathi",
"Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885688_MRACH_323525_15090_20194.wav"
],
[
"30",
"Images/IISc_VaaniProject_GENERIC_0857.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11157614_TGNLFLM_14947_11483_17575.wav"
],
[
"31",
"Images/IISc_VaaniProject_GENERIC_0784.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11161711_UPVNREI_136144_0_11475.wav"
],
[
"32",
"Images/IISc_VaaniProject_GENERIC_0991.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11159902_UPVNML_191445_0_10486.wav"
],
[
"33",
"Images/IISc_VaaniProject_GENERIC_1037.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160952_UPVNTA_123296_12723_14915.wav"
],
[
"34",
"Images/IISc_VaaniProject_GENERIC_1004.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11159231_UPVNHC_147120_0_9681.wav"
],
[
"35",
"Images/IISc_VaaniProject_GENERIC_0798.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11158013_UPVNFN_199962_0_12217.wav"
],
[
"36",
"Images/IISc_VaaniProject_GENERIC_1061.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11159169_TGNLSL_79868_2_11538.wav"
],
[
"37",
"Images/IISc_VaaniProject_GENERIC_0902.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11158905_UPVNIM_17694_0_9213.wav"
],
[
"38",
"Images/IISc_VaaniProject_GENERIC_1028.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11158685_UPVNEX_152314_0_11230.wav"
],
[
"39",
"Images/IISc_VaaniProject_GENERIC_0850.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11157526_UPVNHO_17386_0_10631.wav"
],
[
"40",
"Images/IISc_VaaniProject_GENERIC_0723.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160293_UPVNLK_136180_0_10991.wav"
],
[
"41",
"Images/IISc_VaaniProject_GENERIC_0857.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11157614_TGNLFLM_14947_565_11483.wav"
],
[
"42",
"Images/IISc_VaaniProject_GENERIC_1033.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160176_UPVNGOB_3297_107_14754.wav"
],
[
"43",
"Images/IISc_VaaniProject_GENERIC_0824.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160850_UPVNRI_11356_0_11078.wav"
],
[
"44",
"Images/IISc_VaaniProject_GENERIC_0830.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11158620_TGNLST_11812_0_9545.wav"
],
[
"45",
"Images/IISc_VaaniProject_GENERIC_0923.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160686_UPVNMO_145096_0_12159.wav"
],
[
"46",
"Images/IISc_VaaniProject_GENERIC_0797.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11160624_UPVNSU_55591_2_10857.wav"
],
[
"47",
"Images/IISc_VaaniProject_Solapur-SPECIFIC_00231.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_72631_9964768_MRSREI_271382_0_12602.wav"
],
[
"48",
"Images/IISc_VaaniProject_GENERIC_0991.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_102552_11159902_UPVNML_191445_10486_13559.wav"
],
[
"49",
"Images/IISc_VaaniProject_Solapur-SPECIFIC_01764.jpg",
"Maharashtra",
"Solapur",
"female",
"Marathi",
"Audios/Solapur/IISc_VaaniProject_S_Maharashtra_Solapur_72631_9962743_MRSOTHER_274077_5208_16951.wav"
]
],
"shape": {
"columns": 6,
"rows": 9584932
}
},
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" image_name | \n",
" state | \n",
" district | \n",
" gender | \n",
" audio_language | \n",
" audio_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
"
\n",
" \n",
" 1 | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
"
\n",
" \n",
" 2 | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
"
\n",
" \n",
" 3 | \n",
" Images/IISc_VaaniProject_GENERIC_0073.jpg | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
"
\n",
" \n",
" 4 | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 9584927 | \n",
" Images/IISc_VaaniProject_GENERIC_0554.jpg | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
"
\n",
" \n",
" 9584928 | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
"
\n",
" \n",
" 9584929 | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
"
\n",
" \n",
" 9584930 | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
"
\n",
" \n",
" 9584931 | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
"
\n",
" \n",
"
\n",
"
9584932 rows × 6 columns
\n",
"
"
],
"text/plain": [
" image_name state \\\n",
"0 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n",
"1 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n",
"2 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n",
"3 Images/IISc_VaaniProject_GENERIC_0073.jpg Maharashtra \n",
"4 Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... Maharashtra \n",
"... ... ... \n",
"9584927 Images/IISc_VaaniProject_GENERIC_0554.jpg Karnataka \n",
"9584928 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n",
"9584929 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n",
"9584930 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n",
"9584931 Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... Karnataka \n",
"\n",
" district gender audio_language \\\n",
"0 Aurangabad female Marathi \n",
"1 Aurangabad female Marathi \n",
"2 Aurangabad female Marathi \n",
"3 Aurangabad female Marathi \n",
"4 Aurangabad female Marathi \n",
"... ... ... ... \n",
"9584927 Chamarajanagar female Kannada \n",
"9584928 Chamarajanagar female Kannada \n",
"9584929 Chamarajanagar female Kannada \n",
"9584930 Chamarajanagar female Kannada \n",
"9584931 Chamarajanagar female Kannada \n",
"\n",
" audio_name \n",
"0 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n",
"1 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n",
"2 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n",
"3 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n",
"4 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... \n",
"... ... \n",
"9584927 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n",
"9584928 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n",
"9584929 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n",
"9584930 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n",
"9584931 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... \n",
"\n",
"[9584932 rows x 6 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Turn the column lists accumulated in `data_dict` into a single DataFrame and display it.\n",
"df = pd.DataFrame(data=data_dict)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# df.to_parquet('Vaani-Images-Audio-MetaData.parquet', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"
"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Final ImageBy Full Meta, Pending"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"# import pandas as pd\n",
"import fireducks.pandas as pd\n",
"from tqdm import tqdm, trange\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Paths used by the cells below.\n",
"# NOTE: DATADIR is rebound here; the first cell of the notebook assigns it a different directory.\n",
"FINAL_META = r\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/finalMETA.parquet\"\n",
"IMAGEDIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images\"\n",
"DATADIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/English\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" file_name | \n",
" file_url | \n",
" assertLanguage | \n",
" languagesSpoken | \n",
" state | \n",
" district | \n",
" gender | \n",
" audioFileName | \n",
" imageFileName | \n",
" pincode | \n",
" speakerImageHash | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" Marathi | \n",
" [Marathi] | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" 431105 | \n",
" (iq~v-nq +lTC]QXDCSnJ2~23=+|Nq~nn( | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" Marathi | \n",
" [Marathi] | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" 431105 | \n",
" (iq~v-qq +lTC]QXDCSnJ2~23=+|Nq~miz | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" Marathi | \n",
" [Marathi] | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" 431105 | \n",
" (iq~v.pl +lTC]QXDCSnJ2~23=+|Nq~lo{ | \n",
"
\n",
" \n",
" 3 | \n",
" 5 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" Marathi | \n",
" [Marathi] | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" Images/IISc_VaaniProject_GENERIC_0073.jpg | \n",
" 431105 | \n",
" (iq~v+kl 1<0~A3:Aivx* | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" Marathi | \n",
" [Marathi] | \n",
" Maharashtra | \n",
" Aurangabad | \n",
" female | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0... | \n",
" 431105 | \n",
" (iq~w+rk +lTC]QXDCSnJ2~23=+|Nq(knx | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 9584927 | \n",
" 9696433 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_16081... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" Kannada | \n",
" [Kannada, English] | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" Images/IISc_VaaniProject_GENERIC_0554.jpg | \n",
" 571440 | \n",
" (oi~r)iivq )~=/I+|Nq,nm | \n",
"
\n",
" \n",
" 9584928 | \n",
" 9696434 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_16053... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" Kannada | \n",
" [Kannada, English] | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" 571440 | \n",
" (oi{t(iivq |JPWiCL]K^CTs=G~|80@|Avq)mp | \n",
"
\n",
" \n",
" 9584929 | \n",
" 9696435 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_12370... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" Kannada | \n",
" [Kannada, English] | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" 571440 | \n",
" (kl}q-iivq |JPWiCL]K^CTs=G~|80@|Avr(kr | \n",
"
\n",
" \n",
" 9584930 | \n",
" 9696436 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_09272... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" Kannada | \n",
" [Kannada, English] | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" 571440 | \n",
" ~rk}s+iivq |JPWiCL]K^CTs=G~|80@|Avq*km | \n",
"
\n",
" \n",
" 9584931 | \n",
" 9696437 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_15323... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" Kannada | \n",
" [Kannada, English] | \n",
" Karnataka | \n",
" Chamarajanagar | \n",
" female | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC... | \n",
" 571440 | \n",
" (nlxt+iivq |JPWiCL]K^CTs=G~|80@|Avr,lq | \n",
"
\n",
" \n",
"
\n",
"
9584932 rows x 12 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load the full metadata table from the parquet file at FINAL_META and display it.\n",
"finalMETA = pd.read_parquet(path=FINAL_META)\n",
"finalMETA"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand',\n",
" 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal',\n",
" 'Rajasthan', 'Uttarakhand', 'Goa'], dtype=object)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct values of the `state` column.\n",
"finalMETA[\"state\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Anantpur',\n",
" 'Araria',\n",
" 'Aurangabad',\n",
" 'Balrampur',\n",
" 'Bastar',\n",
" 'Begusarai',\n",
" 'Belgaum',\n",
" 'Bellary',\n",
" 'Bhagalpur',\n",
" 'Bijapur',\n",
" 'Bilaspur',\n",
" 'Budaun',\n",
" 'Chamarajanagar',\n",
" 'Chandrapur',\n",
" 'Chittoor',\n",
" 'Churu',\n",
" 'DakshinDinajpur',\n",
" 'DakshinaKannada',\n",
" 'Darbhanga',\n",
" 'Deoria',\n",
" 'Dharwad',\n",
" 'Dhule',\n",
" 'EastChamparan',\n",
" 'Etah',\n",
" 'Gaya',\n",
" 'Ghazipur',\n",
" 'Gopalganj',\n",
" 'Gorakhpur',\n",
" 'Gulbarga',\n",
" 'Guntur',\n",
" 'Hamirpur',\n",
" 'Jahanabad',\n",
" 'Jalaun',\n",
" 'Jalpaiguri',\n",
" 'Jamtara',\n",
" 'Jamui',\n",
" 'Jashpur',\n",
" 'Jhargram',\n",
" 'JyotibaPhuleNagar',\n",
" 'Kabirdham',\n",
" 'Karimnagar',\n",
" 'Kishanganj',\n",
" 'Kolkata',\n",
" 'Korba',\n",
" 'Krishna',\n",
" 'Lakhisarai',\n",
" 'Madhepura',\n",
" 'Malda',\n",
" 'Muzaffarnagar',\n",
" 'Muzaffarpur',\n",
" 'Mysore',\n",
" 'Nagaur',\n",
" 'Nagpur',\n",
" 'Nalgonda',\n",
" 'North24Parganas',\n",
" 'NorthSouthGoa',\n",
" 'PaschimMedinipur',\n",
" 'Pune',\n",
" 'Purnia',\n",
" 'Purulia',\n",
" 'Raichur',\n",
" 'Raigarh',\n",
" 'Rajnandgaon',\n",
" 'Saharsa',\n",
" 'Sahebganj',\n",
" 'Samastipur',\n",
" 'Saran',\n",
" 'Sarguja',\n",
" 'Shimoga',\n",
" 'Sindhudurg',\n",
" 'Sitamarhi',\n",
" 'Solapur',\n",
" 'Srikakulam',\n",
" 'Sukma',\n",
" 'Supaul',\n",
" 'TehriGarhwal',\n",
" 'Uttarkashi',\n",
" 'Vaishali',\n",
" 'Varanasi',\n",
" 'Vishakapattanam']"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct values of the `district` column, in sorted order.\n",
"sorted(finalMETA[\"district\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"128807"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of directory entries directly under IMAGEDIR.\n",
"sum(1 for _ in os.scandir(IMAGEDIR))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(128807,)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Shape of the array of distinct `imageFileName` values.\n",
"finalMETA[\"imageFileName\"].unique().shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(54,)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Shape of the array of distinct `assertLanguage` values.\n",
"finalMETA[\"assertLanguage\"].unique().shape"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|\u001b[32m█████████████████████████\u001b[0m| 128807/128807 [29:41<00:00, 72.31it/s]\u001b[0m\n"
]
}
],
"source": [
"import csv\n",
"from tqdm import tqdm, trange\n",
"\n",
"# Build one summary row per image from the rows of `finalMETA` and stream the\n",
"# rows to imageBY3.csv. Columns written:\n",
"#   id              1-based position of the image in groupby order\n",
"#   imageFileName   the groupby key\n",
"#   audioCounts     number of non-null assertLanguage values for the image\n",
"#   total           running sum of audioCounts over the images processed so far\n",
"#   state/district  distinct values seen for the image, as sorted lists\n",
"#   assertLanguage  {language: count} from value_counts()\n",
"#   audio_urls      {language: [file_url, ...]}\n",
"#   <language>      one 0/1 presence flag per entry of language_columns\n",
"# List and dict cells are written as their Python repr, so they are plain\n",
"# strings when the CSV is read back.\n",
"\n",
"FLUSH_EVERY = 1000  # rows buffered in memory before each write to disk\n",
"\n",
"language_columns = sorted(finalMETA.assertLanguage.dropna().unique())\n",
"with open('imageBY3.csv', mode='w', newline='', encoding='utf-8') as file:\n",
"    writer = csv.writer(file)\n",
"    writer.writerow(\n",
"        [\"id\", \"imageFileName\", \"audioCounts\", \"total\",\n",
"         \"state\", \"district\", \"assertLanguage\", \"audio_urls\"]\n",
"        + language_columns\n",
"    )\n",
"\n",
"    total = 0\n",
"    write_rows_list = []\n",
"\n",
"    pbar = tqdm(\n",
"        finalMETA.groupby('imageFileName'),\n",
"        colour=\"green\", ncols=70,\n",
"        total=finalMETA['imageFileName'].nunique()\n",
"    )\n",
"\n",
"    for i, (image, data) in enumerate(pbar):\n",
"        assert_Languages = data['assertLanguage'].unique()\n",
"        # Keyed by file_url, so a URL repeated within one image is listed once.\n",
"        assert_Languages_url_dict = data.set_index('file_url')['assertLanguage'].to_dict()\n",
"        audio_urls_dict = {\n",
"            lang: [url for url, v in assert_Languages_url_dict.items() if v == lang]\n",
"            for lang in assert_Languages\n",
"        }\n",
"\n",
"        # unique() returns values in order of first appearance, which made the same\n",
"        # set of states serialise to many different strings in the CSV. Sorting\n",
"        # gives each set one canonical spelling; key=str keeps a stray missing\n",
"        # value from raising when it is compared with strings.\n",
"        state = sorted(data['state'].unique().tolist(), key=str)\n",
"        district = sorted(data['district'].unique().tolist(), key=str)\n",
"\n",
"        audioCounts = data['assertLanguage'].count()\n",
"        total += audioCounts\n",
"        lang_counts = data['assertLanguage'].value_counts().to_dict()\n",
"\n",
"        lang_row = [int(lang in lang_counts) for lang in language_columns]\n",
"\n",
"        write_rows_list.append(\n",
"            [i + 1, image, audioCounts, total, state, district, lang_counts, audio_urls_dict]\n",
"            + lang_row\n",
"        )\n",
"\n",
"        # Flush on buffer size so every batch, including the first, is FLUSH_EVERY rows.\n",
"        if len(write_rows_list) >= FLUSH_EVERY:\n",
"            writer.writerows(write_rows_list)\n",
"            file.flush()\n",
"            write_rows_list = []\n",
"\n",
"    # Write whatever is left over from the last partial batch.\n",
"    if write_rows_list:\n",
"        writer.writerows(write_rows_list)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" imageFileName | \n",
" audioCounts | \n",
" total | \n",
" state | \n",
" district | \n",
" assertLanguage | \n",
" audio_urls | \n",
" Agariya | \n",
" Angika | \n",
" ... | \n",
" Sadri | \n",
" Santali | \n",
" Shekhawati | \n",
" Surgujia | \n",
" Surjapuri | \n",
" Tamil | \n",
" Telugu | \n",
" Tulu | \n",
" Urdu | \n",
" Wagdi | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 52 | \n",
" 52 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 45, 'Hindi': 7} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/A... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 41 | \n",
" 93 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 35, 'Hindi': 6} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/A... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 53 | \n",
" 146 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 45, 'Hindi': 6, 'Bengali': 2} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/A... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 38 | \n",
" 184 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 32, 'Hindi': 5, 'Urdu': 1} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/An... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 57 | \n",
" 241 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 48, 'Hindi': 7, 'Urdu': 2} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/An... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 128802 | \n",
" 128803 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 35 | \n",
" 9584829 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 35} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/V... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 128803 | \n",
" 128804 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 19 | \n",
" 9584848 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 16, 'Hindi': 3} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/V... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 128804 | \n",
" 128805 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 17 | \n",
" 9584865 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 15, 'Hindi': 2} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 128805 | \n",
" 128806 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 31 | \n",
" 9584896 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 31} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/V... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 128806 | \n",
" 128807 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 36 | \n",
" 9584932 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 35, 'Hindi': 1} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/V... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
128807 rows x 62 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Read back the per-image summary CSV and display it.\n",
"imageBY = pd.read_csv(\n",
"    filepath_or_buffer=\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/imageBY3.csv\"\n",
")\n",
"imageBY"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([\"['AndhraPradesh']\", \"['Bihar']\", \"['Maharashtra']\",\n",
" \"['Chhattisgarh']\", \"['Karnataka']\", \"['UttarPradesh']\",\n",
" \"['Rajasthan']\", \"['WestBengal']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'WestBengal', 'Uttarakhand', 'Goa', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Goa', 'Rajasthan', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Bihar', 'Jharkhand', 'Goa', 'Rajasthan', 'Uttarakhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa', 'Maharashtra', 'WestBengal', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Maharashtra', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Goa', 'Uttarakhand', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'WestBengal', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Maharashtra', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Goa', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Goa']\",\n",
" \"['Karnataka', 'Rajasthan', 'AndhraPradesh', 'Bihar', 'UttarPradesh', 'Maharashtra', 'WestBengal', 'Uttarakhand', 'Chhattisgarh', 'Jharkhand', 'Goa', 'Telangana']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa', 'Maharashtra']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Goa', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'Jharkhand', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Maharashtra', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Maharashtra', 'WestBengal', 'Jharkhand', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa', 'Maharashtra']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Goa', 'Jharkhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Goa', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'WestBengal', 'Jharkhand', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa', 'Jharkhand', 'WestBengal']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Jharkhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Goa', 'Bihar', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Uttarakhand']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Bihar', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa', 'Jharkhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Bihar', 'Rajasthan', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Maharashtra', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa', 'Uttarakhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Goa', 'Rajasthan', 'WestBengal', 'Uttarakhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Goa', 'WestBengal', 'Uttarakhand', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Goa', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Goa', 'Rajasthan', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Maharashtra', 'WestBengal', 'Goa', 'Jharkhand', 'Rajasthan', 'Uttarakhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'UttarPradesh', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Maharashtra', 'WestBengal', 'Jharkhand', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Maharashtra', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Goa', 'Jharkhand', 'Rajasthan']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Maharashtra', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Jharkhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'WestBengal', 'Maharashtra', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa', 'Rajasthan', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'Goa', 'Jharkhand', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Goa', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Jharkhand', 'Rajasthan', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Uttarakhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'WestBengal', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa', 'WestBengal', 'Jharkhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Goa', 'Rajasthan', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Maharashtra']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Goa', 'Jharkhand']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Goa', 'Rajasthan', 'WestBengal']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Bihar', 'Rajasthan', 'Uttarakhand', 'Goa', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Maharashtra', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Goa', 'Rajasthan', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'Uttarakhand', 'Bihar', 'UttarPradesh', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'Goa', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'Jharkhand', 'UttarPradesh', 'Goa', 'Rajasthan', 'WestBengal']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Rajasthan', 'WestBengal', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'Goa', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'WestBengal', 'Goa', 'Rajasthan', 'UttarPradesh']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Jharkhand', 'WestBengal', 'Rajasthan', 'UttarPradesh', 'Uttarakhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Goa', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Goa', 'Rajasthan', 'Uttarakhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Karnataka', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa', 'Jharkhand', 'Rajasthan']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'Bihar', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa', 'Jharkhand', 'Rajasthan']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Goa', 'WestBengal', 'Jharkhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Goa', 'Jharkhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'Goa', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Goa', 'Rajasthan', 'Uttarakhand', 'Jharkhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'Goa', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Goa', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Goa', 'UttarPradesh', 'Uttarakhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'Maharashtra', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Goa', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'WestBengal', 'Rajasthan', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Goa', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'WestBengal', 'Rajasthan', 'Goa', 'Uttarakhand', 'Jharkhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Goa', 'Bihar', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'WestBengal', 'Rajasthan']\",\n",
" \"['Karnataka', 'Rajasthan', 'AndhraPradesh', 'Bihar', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'WestBengal', 'Chhattisgarh', 'Goa', 'Jharkhand', 'Telangana']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Goa', 'Rajasthan', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Rajasthan', 'Goa', 'WestBengal', 'Uttarakhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'Goa', 'WestBengal', 'Uttarakhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa', 'Bihar', 'WestBengal', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Maharashtra', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Goa', 'Jharkhand', 'WestBengal', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Goa', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'Maharashtra', 'WestBengal', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Rajasthan', 'WestBengal']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'UttarPradesh', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Goa', 'Maharashtra', 'Jharkhand']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Jharkhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Goa', 'WestBengal']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'Maharashtra', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'UttarPradesh', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Goa', 'Rajasthan']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Jharkhand', 'WestBengal', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Jharkhand', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Telangana', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Jharkhand', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Maharashtra', 'WestBengal', 'Jharkhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Goa', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Maharashtra', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Maharashtra', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Telangana', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'Bihar', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Goa', 'Jharkhand']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Goa', 'Jharkhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa', 'Bihar', 'Jharkhand', 'Rajasthan']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa', 'Rajasthan', 'Jharkhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'Maharashtra', 'Uttarakhand', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Maharashtra', 'Jharkhand', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Jharkhand', 'Telangana', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Jharkhand', 'Rajasthan', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Goa', 'Jharkhand', 'WestBengal', 'Uttarakhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Jharkhand', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Maharashtra', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Telangana', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'WestBengal', 'Jharkhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Telangana', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'WestBengal', 'Jharkhand', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Maharashtra', 'Goa', 'Rajasthan']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'WestBengal', 'Jharkhand', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Goa', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Maharashtra']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Maharashtra', 'Goa', 'Jharkhand', 'Rajasthan', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Rajasthan']\",\n",
" \"['Chhattisgarh', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'Telangana', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Goa', 'Jharkhand', 'Rajasthan', 'Uttarakhand']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Jharkhand', 'WestBengal', 'Goa', 'Rajasthan', 'Uttarakhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Maharashtra', 'WestBengal', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Goa', 'Bihar', 'WestBengal', 'Jharkhand', 'Rajasthan']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Jharkhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'UttarPradesh', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Bihar', 'Uttarakhand', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Maharashtra', 'Goa', 'Jharkhand', 'WestBengal']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Goa', 'Rajasthan', 'UttarPradesh', 'WestBengal']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Rajasthan', 'Goa']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'Jharkhand', 'Maharashtra', 'Uttarakhand', 'WestBengal', 'Goa']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Bihar', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Maharashtra', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Jharkhand', 'Uttarakhand', 'Goa', 'Rajasthan']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'WestBengal', 'Bihar', 'Jharkhand', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Rajasthan', 'Goa', 'Jharkhand']\",\n",
" \"['Chhattisgarh', 'Telangana', 'Jharkhand', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa', 'Bihar', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'Rajasthan', 'Jharkhand', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Goa', 'WestBengal', 'Jharkhand']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Jharkhand', 'WestBengal', 'Uttarakhand', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Goa', 'Jharkhand', 'Rajasthan', 'WestBengal']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa', 'Jharkhand']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Uttarakhand', 'Jharkhand', 'Goa', 'Rajasthan']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Bihar', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Maharashtra', 'Goa']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'WestBengal', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Goa', 'WestBengal', 'Rajasthan']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'Rajasthan', 'Uttarakhand', 'Goa', 'WestBengal']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Maharashtra', 'Jharkhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Rajasthan', 'Goa', 'WestBengal']\",\n",
" \"['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Bihar', 'Rajasthan', 'Goa', 'WestBengal', 'Uttarakhand', 'Jharkhand']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'WestBengal', 'Rajasthan', 'Uttarakhand', 'Goa']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'Goa', 'WestBengal', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Goa', 'Jharkhand', 'WestBengal', 'Uttarakhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Goa', 'Jharkhand', 'Uttarakhand', 'WestBengal']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Goa', 'Uttarakhand', 'WestBengal', 'Jharkhand']\",\n",
" \"['Maharashtra', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Uttarakhand', 'Bihar', 'Jharkhand', 'WestBengal', 'Goa']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Jharkhand', 'WestBengal', 'Goa', 'Rajasthan', 'Uttarakhand']\",\n",
" \"['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Goa', 'Rajasthan', 'Jharkhand']\",\n",
" \"['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Rajasthan', 'Goa', 'Jharkhand', 'WestBengal']\",\n",
" \"['Jharkhand']\", \"['Telangana']\", \"['Goa']\",\n",
" \"['Bihar', 'Uttarakhand']\", \"['Uttarakhand']\"], dtype=object)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct raw values of the `state` column. The displayed result shows that each\n",
"# value is a stringified list and that the same states recur in different orders.\n",
"imageBY[\"state\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"({1}, {1})"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Disabled check (kept for reference): parses every stringified `state` and\n",
"# `district` value and collects the distinct list lengths seen in each column.\n",
"# NOTE(review): the stored ({1}, {1}) output presumably comes from an earlier run,\n",
"# made while this code was still active -- confirm before relying on it.\n",
"# NOTE(review): if re-enabled, prefer ast.literal_eval over eval; eval would execute\n",
"# any expression that happens to be stored in those DataFrame cells.\n",
"# state_len = set()\n",
"# district_len = set()\n",
"# for state, district in zip(imageBY.state.values, imageBY.district.values):\n",
"# state = eval(state)\n",
"# district = eval(district)\n",
"# state_len.add(len(state))\n",
"# district_len.add(len(district))\n",
"# state_len, district_len"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Disabled one-off export (kept for reference): replaces each stringified list in\n",
"# `state` / `district` with its first element, then writes the frame to imageBY3.csv.\n",
"# NOTE(review): it overwrites the imageBY columns in place, so a second run would call\n",
"# eval on the already-flattened names and fail. It also uses eval on cell values --\n",
"# prefer ast.literal_eval if this is ever re-enabled.\n",
"# imageBY.state = imageBY.state.apply(lambda x: eval(x)[0])\n",
"# imageBY.district = imageBY.district.apply(lambda x: eval(x)[0])\n",
"# imageBY.to_csv(\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/imageBY3.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" imageFileName | \n",
" audioCounts | \n",
" total | \n",
" state | \n",
" district | \n",
" assertLanguage | \n",
" audio_urls | \n",
" Agariya | \n",
" Angika | \n",
" ... | \n",
" Sadri | \n",
" Santali | \n",
" Shekhawati | \n",
" Surgujia | \n",
" Surjapuri | \n",
" Tamil | \n",
" Telugu | \n",
" Tulu | \n",
" Urdu | \n",
" Wagdi | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 52 | \n",
" 52 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 45, 'Hindi': 7} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/A... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 41 | \n",
" 93 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 35, 'Hindi': 6} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/A... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 53 | \n",
" 146 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 45, 'Hindi': 6, 'Bengali': 2} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/A... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 38 | \n",
" 184 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 32, 'Hindi': 5, 'Urdu': 1} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/An... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 57 | \n",
" 241 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 48, 'Hindi': 7, 'Urdu': 2} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/An... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 128798 | \n",
" 128799 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 32 | \n",
" 9584687 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 29, 'Hindi': 3} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 128799 | \n",
" 128800 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 35 | \n",
" 9584722 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 33, 'Hindi': 2} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 128803 | \n",
" 128804 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 19 | \n",
" 9584848 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 16, 'Hindi': 3} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/V... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 128804 | \n",
" 128805 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 17 | \n",
" 9584865 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 15, 'Hindi': 2} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 128806 | \n",
" 128807 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 36 | \n",
" 9584932 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 35, 'Hindi': 1} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/V... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
105940 rows x 62 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Keep only the rows whose `Hindi` column equals 1.\n",
"imageBY[imageBY[\"Hindi\"].eq(1)]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(82.24708284487645, 0.3633342908382308, 81.99010923319385)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Shares, in percent, of 128807 image rows (assumed total -- TODO confirm against len(imageBY)).\n",
"# The counts are hard-coded: 105940 and 105609 match the row counts displayed for the\n",
"# Hindi == 1 filter and for the Hindi == 1 without English == 1 filter in this notebook;\n",
"# 468 is presumably the number of rows with English == 1 -- TODO confirm.\n",
"Hindi, English, HindiNotEnglish = (\n",
"    rows * 100 / 128807 for rows in (105940, 468, 105609)\n",
")\n",
"Hindi, English, HindiNotEnglish"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" imageFileName | \n",
" audioCounts | \n",
" total | \n",
" state | \n",
" district | \n",
" assertLanguage | \n",
" audio_urls | \n",
" Agariya | \n",
" Angika | \n",
" ... | \n",
" Sadri | \n",
" Santali | \n",
" Shekhawati | \n",
" Surgujia | \n",
" Surjapuri | \n",
" Tamil | \n",
" Telugu | \n",
" Tulu | \n",
" Urdu | \n",
" Wagdi | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 52 | \n",
" 52 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 45, 'Hindi': 7} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/A... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 41 | \n",
" 93 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 35, 'Hindi': 6} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/A... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 53 | \n",
" 146 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 45, 'Hindi': 6, 'Bengali': 2} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/A... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 38 | \n",
" 184 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 32, 'Hindi': 5, 'Urdu': 1} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/An... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 57 | \n",
" 241 | \n",
" ['AndhraPradesh'] | \n",
" ['Anantpur'] | \n",
" {'Telugu': 48, 'Hindi': 7, 'Urdu': 2} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/An... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 105604 | \n",
" 128799 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 32 | \n",
" 9584687 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 29, 'Hindi': 3} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 105605 | \n",
" 128800 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 35 | \n",
" 9584722 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 33, 'Hindi': 2} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 105606 | \n",
" 128804 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 19 | \n",
" 9584848 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 16, 'Hindi': 3} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/V... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 105607 | \n",
" 128805 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 17 | \n",
" 9584865 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 15, 'Hindi': 2} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 105608 | \n",
" 128807 | \n",
" Images/IISc_VaaniProject_Vishakapattanam-SPECI... | \n",
" 36 | \n",
" 9584932 | \n",
" ['AndhraPradesh'] | \n",
" ['Vishakapattanam'] | \n",
" {'Telugu': 35, 'Hindi': 1} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/V... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
105609 rows x 62 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Rows with Hindi == 1 whose English column is not 1, renumbered from 0.\n",
"HindiNotEnglish_df = (\n",
"    imageBY\n",
"    .loc[lambda frame: frame[\"Hindi\"].eq(1) & frame[\"English\"].ne(1)]\n",
"    .reset_index(drop=True)\n",
")\n",
"HindiNotEnglish_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"\n",
"# Districts of interest, written as State_District labels.\n",
"Hindi_district = [\n",
"    'Delhi_NewDelhi', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Katni',\n",
"    'Chhattisgarh_Bilaspur', 'Maharashtra_Nagpur', 'UttarPradesh_Varanasi',\n",
"    'UttarPradesh_Lucknow', 'UttarPradesh_Gorakhpur'\n",
"]\n",
"# Keep only the part after the last underscore, i.e. the district name.\n",
"Hindi_district = [label.split(\"_\")[-1] for label in Hindi_district]\n",
"print(f\"['{Hindi_district[1]}']\")\n",
"\n",
"# Print, for every row, whether the first district of interest is among the row's districts.\n",
"for i, row in tqdm(HindiNotEnglish_df.iterrows(), colour='blue', total=HindiNotEnglish_df.shape[0], ncols=70):\n",
"    # NOTE(review): this skips only row 1000 and still prints every other row;\n",
"    # confirm whether `break` (a debugging limit) was intended instead.\n",
"    if i == 1000:\n",
"        continue\n",
"    # `district` holds a stringified list. literal_eval parses it as a Python literal\n",
"    # without executing code, which eval would do for any expression stored in the data.\n",
"    row_districts = ast.literal_eval(row['district'])\n",
"    print(Hindi_district[0], row_districts, Hindi_district[0] in row_districts)\n",
"    # if Hindi_district[0] in row_districts:\n",
"    #     print(row)\n",
"    #     break"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" file_name | \n",
" file_url | \n",
" metadata | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
"
\n",
" \n",
" 3 | \n",
" 5 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 9584927 | \n",
" 9696433 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_16081... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
"
\n",
" \n",
" 9584928 | \n",
" 9696434 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_16053... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
"
\n",
" \n",
" 9584929 | \n",
" 9696435 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_12370... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
"
\n",
" \n",
" 9584930 | \n",
" 9696436 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_09272... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
"
\n",
" \n",
" 9584931 | \n",
" 9696437 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_15323... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
"
\n",
" \n",
"
\n",
"
9584932 rows × 4 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Rebinds JSON_PATH to the copy of the metadata file under /scratch. The notebook's first\n",
"# cell builds JSON_PATH from DATADIR under /home, so cells run after this one see a new value.\n",
"JSON_PATH = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Vaani_IIsc_Artpark_Full_Data.json\"\n",
"\n",
"# Load the whole file into one DataFrame; the displayed result has 9584932 rows and 4 columns.\n",
"jsondf = pd.read_json(JSON_PATH)\n",
"jsondf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fields of each record's `metadata` dict to promote to top-level columns.\n",
"metadata_cols = [\n",
"    'assertLanguage', 'languagesSpoken', 'state', 'district', 'gender',\n",
"    'audioFileName', 'imageFileName', 'pincode', 'speakerImageHash',\n",
"]\n",
"\n",
"# Expand the dicts into columns and keep only the selected fields, in the order above.\n",
"meta_df = pd.json_normalize(jsondf['metadata']).loc[:, metadata_cols]\n",
"\n",
"# Swap the nested `metadata` column for the flattened columns, placed side by side.\n",
"finalMETA = pd.concat([jsondf.drop(columns='metadata'), meta_df], axis='columns')\n",
"finalMETA"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Parquet',\n",
" 'English',\n",
" 'train-00000-of-00057.parquet',\n",
" 'NewDelhi_train-00000-of-00054.parquet']"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE(review): `os` is already imported in the notebook's first cell. This cell's\n",
"# execution count of 1 suggests it was run on a fresh kernel -- confirm before removing.\n",
"import os\n",
"# List the entries of the Audios directory on scratch.\n",
"os.listdir(\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" audio | \n",
" language | \n",
" languagesKnown | \n",
" gender | \n",
" state | \n",
" district | \n",
" pincode | \n",
" stay(years) | \n",
" isTranscriptionAvailable | \n",
" transcript | \n",
" referenceImage | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" {'bytes': b'RIFF\\xd2\\x04\\x01\\x00WAVEfmt \\x10\\x... | \n",
" Hindi | \n",
" ['Hindi'] | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
" 110004 | \n",
" NewDelhi(20) | \n",
" No | \n",
" None | \n",
" Images/IISc_VaaniProject_GENERIC_0473.jpg | \n",
"
\n",
" \n",
" 1 | \n",
" {'bytes': b'RIFF\\xb0h\\x01\\x00WAVEfmt \\x10\\x00\\... | \n",
" Hindi | \n",
" ['Hindi'] | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
" 110001 | \n",
" NewDelhi(19) | \n",
" No | \n",
" None | \n",
" Images/IISc_VaaniProject_GENERIC_1011.jpg | \n",
"
\n",
" \n",
" 2 | \n",
" {'bytes': b'RIFF\\x84+\\x01\\x00WAVEfmt \\x10\\x00\\... | \n",
" Hindi | \n",
" ['Hindi'] | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
" 110067 | \n",
" NewDelhi(11) | \n",
" No | \n",
" None | \n",
" Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015... | \n",
"
\n",
" \n",
" 3 | \n",
" {'bytes': b'RIFF2\\xd3\\x01\\x00WAVEfmt \\x10\\x00\\... | \n",
" Hindi | \n",
" ['Hindi'] | \n",
" Male | \n",
" Delhi | \n",
" NewDelhi | \n",
" 110001 | \n",
" NewDelhi(24) | \n",
" Yes | \n",
" ऐच_डी_ऐफ_सी बैंक {H_D_F_C bank} का और उसमे एक ... | \n",
" Images/IISc_VaaniProject_GENERIC_0418.jpg | \n",
"
\n",
" \n",
" 4 | \n",
" {'bytes': b'RIFF\\xe45\\x01\\x00WAVEfmt \\x10\\x00\\... | \n",
" Hindi | \n",
" ['Hindi'] | \n",
" Male | \n",
" Delhi | \n",
" NewDelhi | \n",
" 110023 | \n",
" NewDelhi(20) | \n",
" No | \n",
" None | \n",
" Images/IISc_VaaniProject_GENERIC_0851.jpg | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 3035 | \n",
" {'bytes': b'RIFF\\xbch\\x02\\x00WAVEfmt \\x10\\x00\\... | \n",
" Hindi | \n",
" ['Hindi'] | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
" 110038 | \n",
" NewDelhi(20) | \n",
" No | \n",
" None | \n",
" Images/IISc_VaaniProject_GENERIC_0193.jpg | \n",
"
\n",
" \n",
" 3036 | \n",
" {'bytes': b'RIFFJD\\x01\\x00WAVEfmt \\x10\\x00\\x00... | \n",
" Hindi | \n",
" ['Hindi'] | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
" 110028 | \n",
" NewDelhi(20) | \n",
" No | \n",
" None | \n",
" Images/IISc_VaaniProject_GENERIC_0249.jpg | \n",
"
\n",
" \n",
" 3037 | \n",
" {'bytes': b'RIFF\\x12\\x02\\x01\\x00WAVEfmt \\x10\\x... | \n",
" Hindi | \n",
" ['Hindi'] | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
" 110023 | \n",
" NewDelhi(20) | \n",
" No | \n",
" None | \n",
" Images/IISc_VaaniProject_GENERIC_1268.jpg | \n",
"
\n",
" \n",
" 3038 | \n",
" {'bytes': b'RIFFz\\xbc\\x01\\x00WAVEfmt \\x10\\x00\\... | \n",
" Hindi | \n",
" ['Hindi'] | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
" 110011 | \n",
" NewDelhi(23) | \n",
" No | \n",
" None | \n",
" Images/IISc_VaaniProject_NewDelhi-SPECIFIC_016... | \n",
"
\n",
" \n",
" 3039 | \n",
" {'bytes': b'RIFF\\xc8\\xa4\\x01\\x00WAVEfmt \\x10\\x... | \n",
" Hindi | \n",
" ['Hindi'] | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
" 110011 | \n",
" NewDelhi(23) | \n",
" No | \n",
" None | \n",
" Images/IISc_VaaniProject_NewDelhi-SPECIFIC_012... | \n",
"
\n",
" \n",
"
\n",
"
3040 rows x 11 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load one audio parquet file (NewDelhi, part 00000 of 00054 per its file name) for inspection.\n",
"parquet_df = pd.read_parquet(\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/NewDelhi_train-00000-of-00054.parquet\")\n",
"parquet_df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Images/IISc_VaaniProject_Anantpur-SPECIFIC_00001.jpg'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Image path stored at index label 0 of the Hindi-without-English subset.\n",
"HindiNotEnglish_df.at[0, \"imageFileName\"]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" file_name | \n",
" file_url | \n",
" assertLanguage | \n",
" languagesSpoken | \n",
" state | \n",
" district | \n",
" gender | \n",
" audioFileName | \n",
" imageFileName | \n",
" pincode | \n",
" speakerImageHash | \n",
"
\n",
" \n",
" \n",
" \n",
" 135079 | \n",
" 135081 | \n",
" IISc_VaaniProject_S_AP_Anantpur_113390_1167518... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu, Hindi] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515556 | \n",
" (jo}v(qk +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 135088 | \n",
" 135090 | \n",
" IISc_VaaniProject_S_AP_Anantpur_113390_1167518... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu, Hindi] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515556 | \n",
" (jo}v(qk +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 408373 | \n",
" 408532 | \n",
" IISc_VaaniProject_S_AP_Anantpur_94940_10873932... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Hindi | \n",
" [Hindi, Urdu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515763 | \n",
" (iq}t0lk +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 792425 | \n",
" 792662 | \n",
" IISc_VaaniProject_S_AP_Anantpur_112124_1158582... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515672 | \n",
" (jn~v/km +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 792436 | \n",
" 792673 | \n",
" IISc_VaaniProject_S_AP_Anantpur_112124_1158582... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515672 | \n",
" (jn~v/km +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 1051890 | \n",
" 1052127 | \n",
" IISc_VaaniProject_S_AP_Anantpur_107463_1139557... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515134 | \n",
" (jl(v,pq +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 1051902 | \n",
" 1052139 | \n",
" IISc_VaaniProject_S_AP_Anantpur_107463_1139557... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515134 | \n",
" (jl(v,pq +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 1153317 | \n",
" 1153554 | \n",
" IISc_VaaniProject_S_AP_Anantpur_104144_1121525... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515812 | \n",
" (jkwv)nk +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 1153348 | \n",
" 1153585 | \n",
" IISc_VaaniProject_S_AP_Anantpur_104144_1121525... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515812 | \n",
" (jkwv)nk +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 1453323 | \n",
" 1453709 | \n",
" IISc_VaaniProject_S_AP_Anantpur_121072_1220106... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Hindi | \n",
" [English, Hindi] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515133 | \n",
" (kkvr~ok +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 1453420 | \n",
" 1453806 | \n",
" IISc_VaaniProject_S_AP_Anantpur_121072_1220106... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Hindi | \n",
" [English, Hindi] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515133 | \n",
" (kkvr~ok +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 1455874 | \n",
" 1456260 | \n",
" IISc_VaaniProject_S_AP_Anantpur_116830_1187130... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Hindi, Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515631 | \n",
" (jq}r*im +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 1456082 | \n",
" 1456468 | \n",
" IISc_VaaniProject_S_AP_Anantpur_116830_1187130... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Hindi, Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515631 | \n",
" (jq}r*im +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 4083091 | \n",
" 4084419 | \n",
" IISc_VaaniProject_S_AP_Anantpur_155564_1387714... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Hindi | \n",
" [Hindi, Malayalam] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515401 | \n",
" (lq}x(mo +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 4083133 | \n",
" 4084461 | \n",
" IISc_VaaniProject_S_AP_Anantpur_155564_1387714... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Hindi | \n",
" [Hindi, Malayalam] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515401 | \n",
" (lq}x(mo +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 4084038 | \n",
" 4085366 | \n",
" IISc_VaaniProject_S_AP_Anantpur_118450_1192469... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Hindi | \n",
" [Hindi] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515867 | \n",
" (jrxu-ro +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 4084072 | \n",
" 4085400 | \n",
" IISc_VaaniProject_S_AP_Anantpur_118450_1192469... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Hindi | \n",
" [Hindi] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515867 | \n",
" (jrxu-ro +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 4910440 | \n",
" 4913075 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Pras22443_1156... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu, Hindi, Kannada] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515154 | \n",
" (jn|v,iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4910475 | \n",
" 4913110 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Pras22443_1156... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu, Hindi, Kannada] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515154 | \n",
" (jn|v,iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4915515 | \n",
" 4918150 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Push20666_1748... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515812 | \n",
" (pm~u*iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4915529 | \n",
" 4918164 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Push20666_1748... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515812 | \n",
" (pm~u*iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4916177 | \n",
" 4918812 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Chan92399_0731... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515671 | \n",
" ~plwt.iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4916484 | \n",
" 4919119 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Chan92399_0731... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515671 | \n",
" ~plwt.iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4916671 | \n",
" 4919306 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Chan92399_0731... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515671 | \n",
" ~plwt.iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4916687 | \n",
" 4919322 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Chan92399_0731... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515671 | \n",
" ~plwt.iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4949329 | \n",
" 4951964 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Nare62582_1417... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515671 | \n",
" (mj}u0iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4949401 | \n",
" 4952036 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Nare62582_1417... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515671 | \n",
" (mj}u0iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4949537 | \n",
" 4952172 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Nare62582_1417... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515671 | \n",
" (mj}u0iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 4972181 | \n",
" 4974816 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Prud35888_1142... | \n",
" https://vaani.iisc.ac.in/Audios/Anantpur/IISc_... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" Audios/Anantpur/IISc_VaaniProject_M_AP_Anantpu... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515556 | \n",
" (jmxu*iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6025639 | \n",
" 6137143 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Priy61103_0633... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515435 | \n",
" ~olyr/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6025732 | \n",
" 6137236 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Priy61103_0633... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515435 | \n",
" ~olyr/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6025739 | \n",
" 6137243 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Priy61103_0633... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515435 | \n",
" ~olyr/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6025744 | \n",
" 6137248 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Priy61103_0633... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515435 | \n",
" ~olyr/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6025761 | \n",
" 6137265 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Priy61103_0633... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515435 | \n",
" ~olyr/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6382199 | \n",
" 6493703 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Pill40436_0912... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515501 | \n",
" ~rjxv~iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6382200 | \n",
" 6493704 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Pill40436_0912... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515501 | \n",
" ~rjxv~iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6382208 | \n",
" 6493712 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Pill40436_0912... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515501 | \n",
" ~rjxv~iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6570311 | \n",
" 6681815 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu, Hindi, English] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515261 | \n",
" (oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6570316 | \n",
" 6681820 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu, Hindi, English] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515261 | \n",
" (oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6570317 | \n",
" 6681821 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu, Hindi, English] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515261 | \n",
" (oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6570319 | \n",
" 6681823 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu, Hindi, English] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515261 | \n",
" (oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6570323 | \n",
" 6681827 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu, Hindi, English] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515261 | \n",
" (oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6570329 | \n",
" 6681833 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu, Hindi, English] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515261 | \n",
" (oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 6570330 | \n",
" 6681834 | \n",
" IISc_VaaniProject_M_AP_Anantpur_Nare88751_1619... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu, Hindi, English] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515261 | \n",
" (oj(s/iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 7385670 | \n",
" 7497174 | \n",
" IISc_VaaniProject_M_AP_Anantpur_BODA75520_1956... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu, English] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515425 | \n",
" (rn|s+iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 7385713 | \n",
" 7497217 | \n",
" IISc_VaaniProject_M_AP_Anantpur_BODA75520_1956... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu, English] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_M_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515425 | \n",
" (rn|s+iivq zPPXkRWanJ2~23=+|Nq~iiw | \n",
"
\n",
" \n",
" 7685178 | \n",
" 7796682 | \n",
" IISc_VaaniProject_S_AP_Anantpur_119578_1197153... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Hindi, Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515281 | \n",
" (jr}r,lm +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 7717190 | \n",
" 7828694 | \n",
" IISc_VaaniProject_S_AP_Anantpur_119578_1197153... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Hindi, Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" male | \n",
" /Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515281 | \n",
" (jr}r,lm +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 9238620 | \n",
" 9350126 | \n",
" IISc_VaaniProject_S_AP_Anantpur_109401_1144986... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515001 | \n",
" (jmzz/oi +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 9238631 | \n",
" 9350137 | \n",
" IISc_VaaniProject_S_AP_Anantpur_109401_1144986... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515001 | \n",
" (jmzz/oi +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 9239076 | \n",
" 9350582 | \n",
" IISc_VaaniProject_S_AP_Anantpur_88704_10558507... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515425 | \n",
" (in{y,ip +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
" 9239078 | \n",
" 9350584 | \n",
" IISc_VaaniProject_S_AP_Anantpur_88704_10558507... | \n",
" https://vaani.iisc.ac.in//Audios/Anantpur/IISc... | \n",
" Telugu | \n",
" [Telugu] | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" /Audios/Anantpur/IISc_VaaniProject_S_AP_Anantp... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_000... | \n",
" 515425 | \n",
" (in{y,ip +eCPcZlTfB:<|+53:Aivq~j | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Every finalMETA row whose imageFileName equals the one HindiNotEnglish_df holds at key 0.\n",
"finalMETA.loc[finalMETA['imageFileName'] == HindiNotEnglish_df['imageFileName'][0]]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" imageFileName | \n",
" audioCounts | \n",
" total | \n",
" state | \n",
" district | \n",
" assertLanguage | \n",
" audio_urls | \n",
" Agariya | \n",
" Angika | \n",
" ... | \n",
" Sadri | \n",
" Santali | \n",
" Shekhawati | \n",
" Surgujia | \n",
" Surjapuri | \n",
" Tamil | \n",
" Telugu | \n",
" Tulu | \n",
" Urdu | \n",
" Wagdi | \n",
"
\n",
" \n",
" \n",
" \n",
" 10529 | \n",
" 10530 | \n",
" Images/IISc_VaaniProject_Bellary-SPECIFIC_0023... | \n",
" 46 | \n",
" 417891 | \n",
" ['Karnataka'] | \n",
" ['Bellary'] | \n",
" {'Kannada': 40, 'Hindi': 2, 'Telugu': 2, 'Engl... | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Be... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 10580 | \n",
" 10581 | \n",
" Images/IISc_VaaniProject_Bellary-SPECIFIC_0028... | \n",
" 28 | \n",
" 419513 | \n",
" ['Karnataka'] | \n",
" ['Bellary'] | \n",
" {'Kannada': 17, 'Telugu': 7, 'Hindi': 2, 'Engl... | \n",
" {'Kannada': ['https://vaani.iisc.ac.in/Audios/... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 10855 | \n",
" 10856 | \n",
" Images/IISc_VaaniProject_Bellary-SPECIFIC_0055... | \n",
" 40 | \n",
" 428720 | \n",
" ['Karnataka'] | \n",
" ['Bellary'] | \n",
" {'Kannada': 25, 'Telugu': 11, 'English': 2, 'H... | \n",
" {'Kannada': ['https://vaani.iisc.ac.in/Audios/... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 10862 | \n",
" 10863 | \n",
" Images/IISc_VaaniProject_Bellary-SPECIFIC_0056... | \n",
" 37 | \n",
" 428927 | \n",
" ['Karnataka'] | \n",
" ['Bellary'] | \n",
" {'Kannada': 22, 'Telugu': 11, 'Hindi': 2, 'Eng... | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/B... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 10942 | \n",
" 10943 | \n",
" Images/IISc_VaaniProject_Bellary-SPECIFIC_0064... | \n",
" 39 | \n",
" 431624 | \n",
" ['Karnataka'] | \n",
" ['Bellary'] | \n",
" {'Kannada': 31, 'Telugu': 4, 'English': 2, 'Hi... | \n",
" {'Kannada': ['https://vaani.iisc.ac.in/Audios/... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 70864 | \n",
" 70865 | \n",
" Images/IISc_VaaniProject_Krishna-SPECIFIC_0029... | \n",
" 28 | \n",
" 7336979 | \n",
" ['AndhraPradesh'] | \n",
" ['Krishna'] | \n",
" {'Telugu': 21, 'Hindi': 4, 'English': 3} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Kr... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 71153 | \n",
" 71154 | \n",
" Images/IISc_VaaniProject_Krishna-SPECIFIC_0057... | \n",
" 34 | \n",
" 7346921 | \n",
" ['AndhraPradesh'] | \n",
" ['Krishna'] | \n",
" {'Telugu': 29, 'Hindi': 3, 'English': 2} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/K... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 71161 | \n",
" 71162 | \n",
" Images/IISc_VaaniProject_Krishna-SPECIFIC_0058... | \n",
" 39 | \n",
" 7347205 | \n",
" ['AndhraPradesh'] | \n",
" ['Krishna'] | \n",
" {'Telugu': 32, 'Hindi': 5, 'English': 2} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Kr... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 71665 | \n",
" 71666 | \n",
" Images/IISc_VaaniProject_Krishna-SPECIFIC_0109... | \n",
" 41 | \n",
" 7364827 | \n",
" ['AndhraPradesh'] | \n",
" ['Krishna'] | \n",
" {'Telugu': 36, 'English': 3, 'Hindi': 2} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/K... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 71729 | \n",
" 71730 | \n",
" Images/IISc_VaaniProject_Krishna-SPECIFIC_0115... | \n",
" 32 | \n",
" 7367046 | \n",
" ['AndhraPradesh'] | \n",
" ['Krishna'] | \n",
" {'Telugu': 25, 'Hindi': 4, 'English': 3} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Kr... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
331 rows x 62 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Rows of imageBY where both the English and the Hindi columns equal 1.\n",
"imageBY.loc[imageBY['English'].eq(1) & imageBY['Hindi'].eq(1)]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" imageFileName | \n",
" audioCounts | \n",
" total | \n",
" state | \n",
" district | \n",
" assertLanguage | \n",
" audio_urls | \n",
" Agariya | \n",
" Angika | \n",
" ... | \n",
" Sadri | \n",
" Santali | \n",
" Shekhawati | \n",
" Surgujia | \n",
" Surjapuri | \n",
" Tamil | \n",
" Telugu | \n",
" Tulu | \n",
" Urdu | \n",
" Wagdi | \n",
"
\n",
" \n",
" \n",
" \n",
" 10309 | \n",
" 10310 | \n",
" Images/IISc_VaaniProject_Bellary-SPECIFIC_0001... | \n",
" 41 | \n",
" 410798 | \n",
" ['Karnataka'] | \n",
" ['Bellary'] | \n",
" {'Kannada': 20, 'Telugu': 13, 'Urdu': 4, 'Bear... | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/B... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 10322 | \n",
" 10323 | \n",
" Images/IISc_VaaniProject_Bellary-SPECIFIC_0002... | \n",
" 33 | \n",
" 411256 | \n",
" ['Karnataka'] | \n",
" ['Bellary'] | \n",
" {'Kannada': 25, 'Telugu': 5, 'English': 2, 'Be... | \n",
" {'Kannada': ['https://vaani.iisc.ac.in/Audios/... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 10341 | \n",
" 10342 | \n",
" Images/IISc_VaaniProject_Bellary-SPECIFIC_0004... | \n",
" 40 | \n",
" 411887 | \n",
" ['Karnataka'] | \n",
" ['Bellary'] | \n",
" {'Kannada': 33, 'Telugu': 5, 'English': 2} | \n",
" {'Kannada': ['https://vaani.iisc.ac.in/Audios/... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 10394 | \n",
" 10395 | \n",
" Images/IISc_VaaniProject_Bellary-SPECIFIC_0009... | \n",
" 34 | \n",
" 413709 | \n",
" ['Karnataka'] | \n",
" ['Bellary'] | \n",
" {'Kannada': 28, 'Telugu': 4, 'English': 2} | \n",
" {'Kannada': ['https://vaani.iisc.ac.in/Audios/... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 10508 | \n",
" 10509 | \n",
" Images/IISc_VaaniProject_Bellary-SPECIFIC_0020... | \n",
" 42 | \n",
" 417224 | \n",
" ['Karnataka'] | \n",
" ['Bellary'] | \n",
" {'Kannada': 31, 'Telugu': 9, 'English': 2} | \n",
" {'Kannada': ['https://vaani.iisc.ac.in/Audios/... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 71275 | \n",
" 71276 | \n",
" Images/IISc_VaaniProject_Krishna-SPECIFIC_0070... | \n",
" 28 | \n",
" 7351129 | \n",
" ['AndhraPradesh'] | \n",
" ['Krishna'] | \n",
" {'Telugu': 25, 'English': 3} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/K... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 71372 | \n",
" 71373 | \n",
" Images/IISc_VaaniProject_Krishna-SPECIFIC_0080... | \n",
" 31 | \n",
" 7354457 | \n",
" ['AndhraPradesh'] | \n",
" ['Krishna'] | \n",
" {'Telugu': 29, 'English': 2} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/K... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 71650 | \n",
" 71651 | \n",
" Images/IISc_VaaniProject_Krishna-SPECIFIC_0107... | \n",
" 34 | \n",
" 7364295 | \n",
" ['AndhraPradesh'] | \n",
" ['Krishna'] | \n",
" {'Telugu': 32, 'English': 2} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/K... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 71665 | \n",
" 71666 | \n",
" Images/IISc_VaaniProject_Krishna-SPECIFIC_0109... | \n",
" 41 | \n",
" 7364827 | \n",
" ['AndhraPradesh'] | \n",
" ['Krishna'] | \n",
" {'Telugu': 36, 'English': 3, 'Hindi': 2} | \n",
" {'Telugu': ['https://vaani.iisc.ac.in/Audios/K... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 71729 | \n",
" 71730 | \n",
" Images/IISc_VaaniProject_Krishna-SPECIFIC_0115... | \n",
" 32 | \n",
" 7367046 | \n",
" ['AndhraPradesh'] | \n",
" ['Krishna'] | \n",
" {'Telugu': 25, 'Hindi': 4, 'English': 3} | \n",
" {'Hindi': ['https://vaani.iisc.ac.in/Audios/Kr... | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
468 rows x 62 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Rows of imageBY where the English column equals 1.\n",
"imageBY.loc[imageBY['English'].eq(1)]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 {'Telugu': 45, 'Hindi': 7}\n",
"1 {'Telugu': 35, 'Hindi': 6}\n",
"2 {'Telugu': 45, 'Hindi': 6, 'Bengali': 2}\n",
"3 {'Telugu': 32, 'Hindi': 5, 'Urdu': 1}\n",
"4 {'Telugu': 48, 'Hindi': 7, 'Urdu': 2}\n",
" ... \n",
"128802 {'Telugu': 35}\n",
"128803 {'Telugu': 16, 'Hindi': 3}\n",
"128804 {'Telugu': 15, 'Hindi': 2}\n",
"128805 {'Telugu': 31}\n",
"128806 {'Telugu': 35, 'Hindi': 1}\n",
"Name: assertLanguage, Length: 128807, dtype: object"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the assertLanguage column; the stored output holds one {language: count} dict per row.\n",
"imageBY['assertLanguage']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# imageBY_data = {'imageFileName': [], 'state': [], 'district': [],}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Image - Audio(Hindi) for CSIP"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The original JSON has not been updated to include the Delhi data."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"# import fireducks.pandas as pd\n",
"from tqdm import tqdm, trange\n",
"\n",
"# Absolute scratch-filesystem locations used from this section onwards.\n",
"# NOTE: JSON_PATH is rebound here; the notebook's first cell set it to the copy under DATADIR (/home/...).\n",
"HINDI_AUDIO_DIR = \"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi\"\n",
"IMAGEDIR = \"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images\"\n",
"JSON_PATH = \"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Vaani_IIsc_Artpark_Full_Data.json\""
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" file_name | \n",
" file_url | \n",
" metadata | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
"
\n",
" \n",
" 3 | \n",
" 5 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 9584927 | \n",
" 9696433 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_16081... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
"
\n",
" \n",
" 9584928 | \n",
" 9696434 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_16053... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
"
\n",
" \n",
" 9584929 | \n",
" 9696435 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_12370... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
"
\n",
" \n",
" 9584930 | \n",
" 9696436 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_09272... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
"
\n",
" \n",
" 9584931 | \n",
" 9696437 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_15323... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
"
\n",
" \n",
"
\n",
"
9584932 rows × 4 columns
\n",
"
"
],
"text/plain": [
" id file_name \\\n",
"0 2 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n",
"1 3 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n",
"2 4 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n",
"3 5 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n",
"4 6 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n",
"... ... ... \n",
"9584927 9696433 IISc_VaaniProject_M_KA_Chamrajn_42017276_16081... \n",
"9584928 9696434 IISc_VaaniProject_M_KA_Chamrajn_42017276_16053... \n",
"9584929 9696435 IISc_VaaniProject_M_KA_Chamrajn_42017276_12370... \n",
"9584930 9696436 IISc_VaaniProject_M_KA_Chamrajn_42017276_09272... \n",
"9584931 9696437 IISc_VaaniProject_M_KA_Chamrajn_42017276_15323... \n",
"\n",
" file_url \\\n",
"0 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n",
"1 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n",
"2 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n",
"3 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n",
"4 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n",
"... ... \n",
"9584927 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n",
"9584928 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n",
"9584929 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n",
"9584930 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n",
"9584931 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n",
"\n",
" metadata \n",
"0 {'state': 'Maharashtra', 'gender': 'female', '... \n",
"1 {'state': 'Maharashtra', 'gender': 'female', '... \n",
"2 {'state': 'Maharashtra', 'gender': 'female', '... \n",
"3 {'state': 'Maharashtra', 'gender': 'female', '... \n",
"4 {'state': 'Maharashtra', 'gender': 'female', '... \n",
"... ... \n",
"9584927 {'state': 'Karnataka', 'gender': 'female', 'pi... \n",
"9584928 {'state': 'Karnataka', 'gender': 'female', 'pi... \n",
"9584929 {'state': 'Karnataka', 'gender': 'female', 'pi... \n",
"9584930 {'state': 'Karnataka', 'gender': 'female', 'pi... \n",
"9584931 {'state': 'Karnataka', 'gender': 'female', 'pi... \n",
"\n",
"[9584932 rows x 4 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the whole metadata JSON at JSON_PATH into one DataFrame, then display it.\n",
"jsondf = pd.read_json(path_or_buf=JSON_PATH)\n",
"jsondf"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 9584932/9584932 [00:02<00:00, 4578604.24it/s]\n"
]
}
],
"source": [
"# Scan every file_name for one specific NewDelhi .wav; any exact match is printed.\n",
"# (For a broader check, test `'Delhi' in audio_file_name` instead of equality.)\n",
"target_file_name = 'IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav'\n",
"for audio_file_name in tqdm(jsondf.file_name.values):\n",
"    if audio_file_name == target_file_name:\n",
"        print(audio_file_name)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" file_name | \n",
" file_url | \n",
" metadata | \n",
" assertLanguage | \n",
" audioFileName | \n",
" audioManualQC | \n",
" audioQualityCheck | \n",
" district | \n",
" gender | \n",
" ... | \n",
" imageFaceData.face_90.facial_area | \n",
" imageFaceData.face_91.facial_area | \n",
" imageFaceData.face_92.facial_area | \n",
" imageFaceData.face_93.facial_area | \n",
" imageFaceData.face_94.facial_area | \n",
" imageFaceData.face_95.facial_area | \n",
" imageFaceData.face_96.facial_area | \n",
" imageFaceData.face_97.facial_area | \n",
" imageFaceData.face_98.facial_area | \n",
" imageFaceData.face_99.facial_area | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" NaN | \n",
" Automated | \n",
" Aurangabad | \n",
" female | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" NaN | \n",
" Automated | \n",
" Aurangabad | \n",
" female | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" NaN | \n",
" Automated | \n",
" Aurangabad | \n",
" female | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 5 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" NaN | \n",
" Automated | \n",
" Aurangabad | \n",
" female | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'state': 'Maharashtra', 'gender': 'female', '... | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" NaN | \n",
" Automated&Manual | \n",
" Aurangabad | \n",
" female | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 9584927 | \n",
" 9696433 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_16081... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" NaN | \n",
" Automated | \n",
" Chamarajanagar | \n",
" female | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 9584928 | \n",
" 9696434 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_16053... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" NaN | \n",
" Automated&Manual | \n",
" Chamarajanagar | \n",
" female | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 9584929 | \n",
" 9696435 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_12370... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" NaN | \n",
" Automated | \n",
" Chamarajanagar | \n",
" female | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 9584930 | \n",
" 9696436 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_09272... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" NaN | \n",
" Automated | \n",
" Chamarajanagar | \n",
" female | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 9584931 | \n",
" 9696437 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_15323... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'state': 'Karnataka', 'gender': 'female', 'pi... | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" NaN | \n",
" Automated | \n",
" Chamarajanagar | \n",
" female | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
9584932 rows × 251 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fullJSON = pd.concat([jsondf, pd.json_normalize(jsondf.metadata)], axis=1)\n",
"fullJSON"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# fullJSON.to_parquet(\"Vaani-Images-Audio-JSON.parquet\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" file_name | \n",
" file_url | \n",
" metadata | \n",
" assertLanguage | \n",
" audioFileName | \n",
" audioManualQC | \n",
" audioQualityCheck | \n",
" district | \n",
" gender | \n",
" ... | \n",
" imageFaceData.face_90.facial_area | \n",
" imageFaceData.face_91.facial_area | \n",
" imageFaceData.face_92.facial_area | \n",
" imageFaceData.face_93.facial_area | \n",
" imageFaceData.face_94.facial_area | \n",
" imageFaceData.face_95.facial_area | \n",
" imageFaceData.face_96.facial_area | \n",
" imageFaceData.face_97.facial_area | \n",
" imageFaceData.face_98.facial_area | \n",
" imageFaceData.face_99.facial_area | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'assertLanguage': 'Marathi', 'audioFileName':... | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" NaN | \n",
" Automated | \n",
" Aurangabad | \n",
" female | \n",
" ... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'assertLanguage': 'Marathi', 'audioFileName':... | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" NaN | \n",
" Automated | \n",
" Aurangabad | \n",
" female | \n",
" ... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'assertLanguage': 'Marathi', 'audioFileName':... | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" NaN | \n",
" Automated | \n",
" Aurangabad | \n",
" female | \n",
" ... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 3 | \n",
" 5 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'assertLanguage': 'Marathi', 'audioFileName':... | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" NaN | \n",
" Automated | \n",
" Aurangabad | \n",
" female | \n",
" ... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" IISc_VaaniProject_S_Maharashtra_Aurangabad_952... | \n",
" https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... | \n",
" {'assertLanguage': 'Marathi', 'audioFileName':... | \n",
" Marathi | \n",
" Audios/Aurangabad/IISc_VaaniProject_S_Maharash... | \n",
" NaN | \n",
" Automated&Manual | \n",
" Aurangabad | \n",
" female | \n",
" ... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 9584927 | \n",
" 9696433 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_16081... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'assertLanguage': 'Kannada', 'audioFileName':... | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" NaN | \n",
" Automated | \n",
" Chamarajanagar | \n",
" female | \n",
" ... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 9584928 | \n",
" 9696434 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_16053... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'assertLanguage': 'Kannada', 'audioFileName':... | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" NaN | \n",
" Automated&Manual | \n",
" Chamarajanagar | \n",
" female | \n",
" ... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 9584929 | \n",
" 9696435 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_12370... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'assertLanguage': 'Kannada', 'audioFileName':... | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" NaN | \n",
" Automated | \n",
" Chamarajanagar | \n",
" female | \n",
" ... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 9584930 | \n",
" 9696436 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_09272... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'assertLanguage': 'Kannada', 'audioFileName':... | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" NaN | \n",
" Automated | \n",
" Chamarajanagar | \n",
" female | \n",
" ... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 9584931 | \n",
" 9696437 | \n",
" IISc_VaaniProject_M_KA_Chamrajn_42017276_15323... | \n",
" https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... | \n",
" {'assertLanguage': 'Kannada', 'audioFileName':... | \n",
" Kannada | \n",
" Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... | \n",
" NaN | \n",
" Automated | \n",
" Chamarajanagar | \n",
" female | \n",
" ... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
9584932 rows × 251 columns
\n",
"
"
],
"text/plain": [
" id file_name \\\n",
"0 2 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n",
"1 3 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n",
"2 4 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n",
"3 5 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n",
"4 6 IISc_VaaniProject_S_Maharashtra_Aurangabad_952... \n",
"... ... ... \n",
"9584927 9696433 IISc_VaaniProject_M_KA_Chamrajn_42017276_16081... \n",
"9584928 9696434 IISc_VaaniProject_M_KA_Chamrajn_42017276_16053... \n",
"9584929 9696435 IISc_VaaniProject_M_KA_Chamrajn_42017276_12370... \n",
"9584930 9696436 IISc_VaaniProject_M_KA_Chamrajn_42017276_09272... \n",
"9584931 9696437 IISc_VaaniProject_M_KA_Chamrajn_42017276_15323... \n",
"\n",
" file_url \\\n",
"0 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n",
"1 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n",
"2 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n",
"3 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n",
"4 https://vaani.iisc.ac.in/Audios/Aurangabad/IIS... \n",
"... ... \n",
"9584927 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n",
"9584928 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n",
"9584929 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n",
"9584930 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n",
"9584931 https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_... \n",
"\n",
" metadata assertLanguage \\\n",
"0 {'assertLanguage': 'Marathi', 'audioFileName':... Marathi \n",
"1 {'assertLanguage': 'Marathi', 'audioFileName':... Marathi \n",
"2 {'assertLanguage': 'Marathi', 'audioFileName':... Marathi \n",
"3 {'assertLanguage': 'Marathi', 'audioFileName':... Marathi \n",
"4 {'assertLanguage': 'Marathi', 'audioFileName':... Marathi \n",
"... ... ... \n",
"9584927 {'assertLanguage': 'Kannada', 'audioFileName':... Kannada \n",
"9584928 {'assertLanguage': 'Kannada', 'audioFileName':... Kannada \n",
"9584929 {'assertLanguage': 'Kannada', 'audioFileName':... Kannada \n",
"9584930 {'assertLanguage': 'Kannada', 'audioFileName':... Kannada \n",
"9584931 {'assertLanguage': 'Kannada', 'audioFileName':... Kannada \n",
"\n",
" audioFileName audioManualQC \\\n",
"0 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... NaN \n",
"1 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... NaN \n",
"2 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... NaN \n",
"3 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... NaN \n",
"4 Audios/Aurangabad/IISc_VaaniProject_S_Maharash... NaN \n",
"... ... ... \n",
"9584927 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... NaN \n",
"9584928 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... NaN \n",
"9584929 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... NaN \n",
"9584930 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... NaN \n",
"9584931 Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj... NaN \n",
"\n",
" audioQualityCheck district gender ... \\\n",
"0 Automated Aurangabad female ... \n",
"1 Automated Aurangabad female ... \n",
"2 Automated Aurangabad female ... \n",
"3 Automated Aurangabad female ... \n",
"4 Automated&Manual Aurangabad female ... \n",
"... ... ... ... ... \n",
"9584927 Automated Chamarajanagar female ... \n",
"9584928 Automated&Manual Chamarajanagar female ... \n",
"9584929 Automated Chamarajanagar female ... \n",
"9584930 Automated Chamarajanagar female ... \n",
"9584931 Automated Chamarajanagar female ... \n",
"\n",
" imageFaceData.face_90.facial_area imageFaceData.face_91.facial_area \\\n",
"0 None None \n",
"1 None None \n",
"2 None None \n",
"3 None None \n",
"4 None None \n",
"... ... ... \n",
"9584927 None None \n",
"9584928 None None \n",
"9584929 None None \n",
"9584930 None None \n",
"9584931 None None \n",
"\n",
" imageFaceData.face_92.facial_area imageFaceData.face_93.facial_area \\\n",
"0 None None \n",
"1 None None \n",
"2 None None \n",
"3 None None \n",
"4 None None \n",
"... ... ... \n",
"9584927 None None \n",
"9584928 None None \n",
"9584929 None None \n",
"9584930 None None \n",
"9584931 None None \n",
"\n",
" imageFaceData.face_94.facial_area imageFaceData.face_95.facial_area \\\n",
"0 None None \n",
"1 None None \n",
"2 None None \n",
"3 None None \n",
"4 None None \n",
"... ... ... \n",
"9584927 None None \n",
"9584928 None None \n",
"9584929 None None \n",
"9584930 None None \n",
"9584931 None None \n",
"\n",
" imageFaceData.face_96.facial_area imageFaceData.face_97.facial_area \\\n",
"0 None None \n",
"1 None None \n",
"2 None None \n",
"3 None None \n",
"4 None None \n",
"... ... ... \n",
"9584927 None None \n",
"9584928 None None \n",
"9584929 None None \n",
"9584930 None None \n",
"9584931 None None \n",
"\n",
" imageFaceData.face_98.facial_area imageFaceData.face_99.facial_area \n",
"0 None None \n",
"1 None None \n",
"2 None None \n",
"3 None None \n",
"4 None None \n",
"... ... ... \n",
"9584927 None None \n",
"9584928 None None \n",
"9584929 None None \n",
"9584930 None None \n",
"9584931 None None \n",
"\n",
"[9584932 rows x 251 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fullJSON = pd.read_parquet(\"Vaani-Images-Audio-JSON.parquet\")\n",
"fullJSON"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/9584932 [00:00, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 9584932/9584932 [00:01<00:00, 6650531.75it/s]\n"
]
}
],
"source": [
"# Same exact-match scan as on jsondf, run against the parquet-loaded fullJSON:\n",
"# print each file_name equal to the one .wav name being looked for.\n",
"wanted_wav = 'IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav'\n",
"for file_name in tqdm(fullJSON.file_name.values):\n",
"    if file_name != wanted_wav:\n",
"        continue\n",
"    print(file_name)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'Delhi' in '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" file_name | \n",
" file_url | \n",
" metadata | \n",
" assertLanguage | \n",
" audioFileName | \n",
" audioManualQC | \n",
" audioQualityCheck | \n",
" district | \n",
" gender | \n",
" ... | \n",
" imageFaceData.face_90.facial_area | \n",
" imageFaceData.face_91.facial_area | \n",
" imageFaceData.face_92.facial_area | \n",
" imageFaceData.face_93.facial_area | \n",
" imageFaceData.face_94.facial_area | \n",
" imageFaceData.face_95.facial_area | \n",
" imageFaceData.face_96.facial_area | \n",
" imageFaceData.face_97.facial_area | \n",
" imageFaceData.face_98.facial_area | \n",
" imageFaceData.face_99.facial_area | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 251 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [id, file_name, file_url, metadata, assertLanguage, audioFileName, audioManualQC, audioQualityCheck, district, gender, imageFaceData, imageFileName, languagesSpoken, pincode, speakerImageHash, state, stay(years), transcript, transcriptQualityCheck, imageFaceData.face_1.facial_area, imageFaceData.face_10, imageFaceData.face_100, imageFaceData.face_101, imageFaceData.face_102, imageFaceData.face_103, imageFaceData.face_104, imageFaceData.face_105, imageFaceData.face_106, imageFaceData.face_107, imageFaceData.face_108, imageFaceData.face_109, imageFaceData.face_11, imageFaceData.face_110, imageFaceData.face_111, imageFaceData.face_12, imageFaceData.face_13, imageFaceData.face_14, imageFaceData.face_15, imageFaceData.face_16, imageFaceData.face_17, imageFaceData.face_18, imageFaceData.face_19, imageFaceData.face_2.facial_area, imageFaceData.face_20, imageFaceData.face_21, imageFaceData.face_22, imageFaceData.face_23, imageFaceData.face_24, imageFaceData.face_25, imageFaceData.face_26, imageFaceData.face_27, imageFaceData.face_28, imageFaceData.face_29, imageFaceData.face_3.facial_area, imageFaceData.face_30, imageFaceData.face_31, imageFaceData.face_32, imageFaceData.face_33, imageFaceData.face_34, imageFaceData.face_35, imageFaceData.face_36, imageFaceData.face_37, imageFaceData.face_38, imageFaceData.face_39, imageFaceData.face_4.facial_area, imageFaceData.face_40, imageFaceData.face_41, imageFaceData.face_42, imageFaceData.face_43, imageFaceData.face_44, imageFaceData.face_45, imageFaceData.face_46, imageFaceData.face_47, imageFaceData.face_48, imageFaceData.face_49, imageFaceData.face_5.facial_area, imageFaceData.face_50, imageFaceData.face_51, imageFaceData.face_52, imageFaceData.face_53, imageFaceData.face_54, imageFaceData.face_55, imageFaceData.face_56, imageFaceData.face_57, imageFaceData.face_58, imageFaceData.face_59, imageFaceData.face_6.facial_area, imageFaceData.face_60, imageFaceData.face_61, imageFaceData.face_62, imageFaceData.face_63, 
imageFaceData.face_64, imageFaceData.face_65, imageFaceData.face_66, imageFaceData.face_67, imageFaceData.face_68, imageFaceData.face_69, imageFaceData.face_7.facial_area, imageFaceData.face_70, imageFaceData.face_71, ...]\n",
"Index: []\n",
"\n",
"[0 rows x 251 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fullJSON.loc[fullJSON.state == 'Delhi']"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([np.int64(2),\n",
" 'IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav',\n",
" 'https://vaani.iisc.ac.in/Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav',\n",
" {'assertLanguage': 'Marathi', 'audioFileName': 'Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav', 'audioManualQC': None, 'audioQualityCheck': 'Automated', 'district': 'Aurangabad', 'gender': 'female', 'imageFaceData': None, 'imageFileName': 'Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00559.jpg', 'languagesSpoken': array(['Marathi'], dtype=object), 'pincode': '431105', 'speakerImageHash': '(iq~v-nq +lTC]QXDCSnJ2~23=+|Nq~nn(', 'state': 'Maharashtra', 'stay(years)': 'Aurangabad(23)', 'transcript': None, 'transcriptQualityCheck': None},\n",
" 'Marathi',\n",
" 'Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav',\n",
" np.float64(nan), 'Automated', 'Aurangabad', 'female',\n",
" np.float64(nan),\n",
" 'Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00559.jpg',\n",
" array(['Marathi'], dtype=object), '431105',\n",
" '(iq~v-nq +lTC]QXDCSnJ2~23=+|Nq~nn(', 'Maharashtra',\n",
" 'Aurangabad(23)', None, None, None, np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), None, np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" None, np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), None,\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), None, np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), None, np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" None, np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), None,\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), None, np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), None, None, None, None, None, None, None, None,\n",
" None, None, None, np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), np.float64(nan), np.float64(nan),\n",
" np.float64(nan), np.float64(nan), None, None, None, None, None,\n",
" None, None, None, None, None, None, None, None, None, None, None,\n",
" None, None, None, None, None, None, None, None, None, None, None,\n",
" None, None, None, None, None, None, None, None, None, None, None,\n",
" None, None, None, None, None, None, None, None, None, None, None,\n",
" None, None, None, None, None, None, None, None, None, None, None,\n",
" None, None, None, None, None, None, None, None, None, None, None,\n",
" None, None, None, None, None, None, None, None, None, None, None,\n",
" None, None, None, None, None, None, None, None, None, None, None,\n",
" None, None, None, None, None, None, None, None, None], dtype=object)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fullJSON.iloc[0,:].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" state | \n",
" district | \n",
" gender | \n",
" assertLanguage | \n",
" file_name | \n",
" imageFileName | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Bengali | \n",
" IISc_VaaniProject_S_AP_Anantpur_100778_1106180... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_009... | \n",
"
\n",
" \n",
" 1 | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Bengali | \n",
" IISc_VaaniProject_S_AP_Anantpur_100778_1106122... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_001... | \n",
"
\n",
" \n",
" 2 | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Bengali | \n",
" IISc_VaaniProject_S_AP_Anantpur_100778_1106132... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_007... | \n",
"
\n",
" \n",
" 3 | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Bengali | \n",
" IISc_VaaniProject_S_AP_Anantpur_100778_1106123... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_010... | \n",
"
\n",
" \n",
" 4 | \n",
" AndhraPradesh | \n",
" Anantpur | \n",
" female | \n",
" Bengali | \n",
" IISc_VaaniProject_S_AP_Anantpur_100778_1106123... | \n",
" Images/IISc_VaaniProject_Anantpur-SPECIFIC_005... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 9584927 | \n",
" WestBengal | \n",
" Purulia | \n",
" male | \n",
" Santali | \n",
" IISc_VaaniProject_M_WB_Purulia_Guru45176_03235... | \n",
" Images/IISc_VaaniProject_Purulia-SPECIFIC_0165... | \n",
"
\n",
" \n",
" 9584928 | \n",
" WestBengal | \n",
" Purulia | \n",
" male | \n",
" Santali | \n",
" IISc_VaaniProject_M_WB_Purulia_Guru45176_03313... | \n",
" Images/IISc_VaaniProject_GENERIC_0839.jpg | \n",
"
\n",
" \n",
" 9584929 | \n",
" WestBengal | \n",
" Purulia | \n",
" male | \n",
" Santali | \n",
" IISc_VaaniProject_M_WB_Purulia_Guru45176_03292... | \n",
" Images/IISc_VaaniProject_GENERIC_0022.jpg | \n",
"
\n",
" \n",
" 9584930 | \n",
" WestBengal | \n",
" Purulia | \n",
" male | \n",
" Santali | \n",
" IISc_VaaniProject_M_WB_Purulia_Guru45176_03050... | \n",
" Images/IISc_VaaniProject_Purulia-SPECIFIC_0162... | \n",
"
\n",
" \n",
" 9584931 | \n",
" WestBengal | \n",
" Purulia | \n",
" male | \n",
" Santali | \n",
" IISc_VaaniProject_M_WB_Purulia_Kira23456_04200... | \n",
" Images/IISc_VaaniProject_Purulia-SPECIFIC_0069... | \n",
"
\n",
" \n",
"
\n",
"
9584932 rows x 6 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Project fullJSON down to the six listed columns, order the rows by state,\n",
"# district and assertLanguage (all ascending), then renumber the index from 0.\n",
"# NOTE(review): despite the variable name, no Hindi-only filter is applied here.\n",
"ImageAudioHindi_df = (\n",
"    fullJSON\n",
"    .loc[:, ['state', 'district', 'gender', 'assertLanguage', 'file_name', 'imageFileName']]\n",
"    .sort_values(by=['state', 'district', 'assertLanguage'], ascending=True)\n",
"    .reset_index(drop=True)\n",
")\n",
"# ImageAudioHindi_df.to_csv(\"Image-Audio-Hindi.csv\", index=False)\n",
"ImageAudioHindi_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Image - Audio(Hindi) Dataloaders for CSIP"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"# import pandas as pd\n",
"import fireducks.pandas as pd\n",
"from tqdm import tqdm, trange\n",
"\n",
"HINDI_AUDIO_DIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi\"\n",
"IMAGEDIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images\"\n",
"IMAGEAUDIOCSV = r\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi3.csv\""
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"def walkDIR(folder_path, include=None):\n",
"    \"\"\"Recursively collect the paths of the files found under `folder_path`.\n",
"\n",
"    Args:\n",
"        folder_path: Directory handed to `os.walk`.\n",
"        include: Optional file-name suffix(es) to keep, e.g. ['.wav'].\n",
"            A single string is treated as one suffix; None keeps every file.\n",
"\n",
"    Returns:\n",
"        list: `os.path.join(root, file)` for every kept file, in `os.walk` order.\n",
"    \"\"\"\n",
"    if include is None:\n",
"        suffixes = None\n",
"    elif isinstance(include, str):\n",
"        # Iterating a bare string would test each character as its own suffix.\n",
"        suffixes = (include,)\n",
"    else:\n",
"        suffixes = tuple(include)\n",
"\n",
"    file_list = []\n",
"    for root, _, files in os.walk(folder_path):\n",
"        for file in files:\n",
"            # str.endswith accepts a tuple and is True if any suffix matches.\n",
"            if suffixes is None or file.endswith(suffixes):\n",
"                file_list.append(os.path.join(root, file))\n",
"    print(\"Files found:\", len(file_list))\n",
"    return file_list"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Files found: 38300\n"
]
},
{
"data": {
"text/plain": [
"(128807, 38300)"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Images are taken from the direct entries of IMAGEDIR (.jpg only); the Hindi\n",
"# audio lives in sub-folders of HINDI_AUDIO_DIR, so it is gathered with walkDIR.\n",
"images_files = []\n",
"for image_name in os.listdir(IMAGEDIR):\n",
"    if image_name.endswith(\".jpg\"):\n",
"        images_files.append(os.path.join(IMAGEDIR, image_name))\n",
"\n",
"audio_files = walkDIR(HINDI_AUDIO_DIR, include=['.wav'])\n",
"\n",
"(len(images_files), len(audio_files))"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_TehriGarhwal-SPECIFIC_00863.jpg',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Budaun-SPECIFIC_00129.jpg']"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"images_files[:3]"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Ishi43015_43015203103000098276_NewDelhi-SPECIFIC_00021_9344_14805.wav',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Radi56078_56078170942000098763_GENERIC_1166_182_3703.wav']"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"audio_files[:3]"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani'"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.path.dirname(IMAGEDIR)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" audio_path | \n",
" referenceImage | \n",
" gender | \n",
" state | \n",
" district | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Zoya76543_7... | \n",
" Images/IISc_VaaniProject_GENERIC_0473.jpg | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 1 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Kaja46663_4... | \n",
" Images/IISc_VaaniProject_GENERIC_1011.jpg | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 2 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Kris66646_6... | \n",
" Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015... | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 3 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Abuz00012_0... | \n",
" Images/IISc_VaaniProject_GENERIC_0418.jpg | \n",
" Male | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 4 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Adah26256_2... | \n",
" Images/IISc_VaaniProject_GENERIC_0851.jpg | \n",
" Male | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 38295 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_00594... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38296 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_00700... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38297 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_01941... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38298 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_01405... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38299 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_01327... | \n",
" Female | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
"
\n",
"
38300 rows x 5 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"ImageAudioHindi_df = pd.read_csv(IMAGEAUDIOCSV)\n",
"ImageAudioHindi_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Audio Image Mapping"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi')"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"IMAGEDIR, HINDI_AUDIO_DIR"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|\u001b[33m████████████████████████████████████████████████████████\u001b[0m| 38300/38300 [00:54<00:00, 704.27it/s]\u001b[0m\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" image_path | \n",
" audio_path | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 1 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 2 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 3 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 4 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 22327 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 22328 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 22329 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 22330 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 22331 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
"
\n",
"
22332 rows × 2 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"mapping_counter = 0\n",
"available_img_audios = {\"image_path\":[], \"audio_path\":[]}\n",
"\n",
"for i, row in tqdm(ImageAudioHindi_df.iterrows(), ncols=100, total=ImageAudioHindi_df.shape[0], colour='YELLOW'):\n",
" image_path = os.path.join(os.path.dirname(IMAGEDIR), row.referenceImage)\n",
" audio_path = os.path.join(HINDI_AUDIO_DIR, f\"{row.state}_{row.district}\", row.audio_path)\n",
" \n",
" if all([os.path.isfile(audio_path), os.path.isfile(image_path)]):\n",
" available_img_audios['image_path'].append(image_path)\n",
" available_img_audios['audio_path'].append(audio_path)\n",
"\n",
"available_img_audios_df = pd.DataFrame(available_img_audios)\n",
"available_img_audios_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# available_img_audios_df.to_csv(\"available_img_audios.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((15632, 2), (6700, 2))"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# from sklearn.model_selection import train_test_split\n",
"\n",
"# train_df, test_df = train_test_split(available_img_audios_df, test_size=0.3, shuffle=True, random_state=42)\n",
"# train_df = train_df.reset_index(drop=True)\n",
"# test_df = test_df.reset_index(drop=True)\n",
"# train_df.shape, test_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# train_df.to_csv(\"available_img_audios_TRAIN.csv\", index=False)\n",
"# test_df.to_csv(\"available_img_audios_TEST.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Audio Image Mapping for New Downloaded Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"# import pandas as pd\n",
"import fireducks.pandas as pd\n",
"from tqdm import tqdm, trange\n",
"\n",
"HINDI_AUDIO_DIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi\"\n",
"IMAGEDIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images\"\n",
"IMAGEAUDIOCSV = r\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi3.csv\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def walkDIR(folder_path, include=None):\n",
" file_list = []\n",
" for root, _, files in os.walk(folder_path):\n",
" for file in files:\n",
" if include is None or any(file.endswith(ext) for ext in include):\n",
" file_list.append(os.path.join(root, file))\n",
" print(\"Files found:\", len(file_list))\n",
" return file_list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Files found: 38300\n"
]
},
{
"data": {
"text/plain": [
"(128807, 38300)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"images_files = [os.path.join(IMAGEDIR, i) for i in os.listdir(IMAGEDIR) if i.endswith(\".jpg\")]\n",
"audio_files = walkDIR(HINDI_AUDIO_DIR, include=['.wav'])\n",
"\n",
"len(images_files), len(audio_files)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_TehriGarhwal-SPECIFIC_00863.jpg',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Budaun-SPECIFIC_00129.jpg']"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"images_files[:3]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Ishi43015_43015203103000098276_NewDelhi-SPECIFIC_00021_9344_14805.wav',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Radi56078_56078170942000098763_GENERIC_1166_182_3703.wav']"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"audio_files[:3]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"os.path.dirname(IMAGEDIR)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi')"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"IMAGEDIR, HINDI_AUDIO_DIR"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|\u001b[33m████████████████████████████████████████████████████████\u001b[0m| 38300/38300 [00:54<00:00, 704.27it/s]\u001b[0m\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" image_path | \n",
" audio_path | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 1 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 2 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 3 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 4 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 22327 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 22328 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 22329 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 22330 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 22331 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
"
\n",
"
22332 rows × 2 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"mapping_counter = 0\n",
"available_img_audios = {\"image_path\":[], \"audio_path\":[]}\n",
"\n",
"for i, row in tqdm(ImageAudioHindi_df.iterrows(), ncols=100, total=ImageAudioHindi_df.shape[0], colour='YELLOW'):\n",
" image_path = os.path.join(os.path.dirname(IMAGEDIR), row.referenceImage)\n",
" audio_path = os.path.join(HINDI_AUDIO_DIR, f\"{row.state}_{row.district}\", row.audio_path)\n",
" \n",
" if all([os.path.isfile(audio_path), os.path.isfile(image_path)]):\n",
" available_img_audios['image_path'].append(image_path)\n",
" available_img_audios['audio_path'].append(audio_path)\n",
"\n",
"available_img_audios_df = pd.DataFrame(available_img_audios)\n",
"available_img_audios_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# available_img_audios_df.to_csv(\"available_img_audios.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((15632, 2), (6700, 2))"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# from sklearn.model_selection import train_test_split\n",
"\n",
"# train_df, test_df = train_test_split(available_img_audios_df, test_size=0.3, shuffle=True, random_state=42)\n",
"# train_df = train_df.reset_index(drop=True)\n",
"# test_df = test_df.reset_index(drop=True)\n",
"# train_df.shape, test_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# train_df.to_csv(\"available_img_audios_TRAIN.csv\", index=False)\n",
"# test_df.to_csv(\"available_img_audios_TEST.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Audio Image Mapping for All Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os, shutil\n",
"def copy_files_from_folders(name, source_folders, destination_folder):\n",
" r'''\n",
" Copies files from multiple source folders to a destination folder, \n",
" renaming them based on the source folder type.\n",
" Parameters:\n",
" name (str): The name to be appended to the destination folder.\n",
" source_folders (list): A list of paths to the source folders.\n",
" destination_folder (str): The path to the destination folder.\n",
" Returns:\n",
" None\n",
" The function creates a new folder inside the destination folder with \n",
" the given name. It then iterates through each file in the source \n",
" folders, renaming them based on the folder type ('story', 'highligits', or 'post') \n",
" and copying them to the destination folder. If a file with the same name \n",
" already exists in the destination folder, it is added to a list of duplicate \n",
" files, which is printed at the end along with the total number of files copied.\n",
" Example:\n",
" name = 'Folder1'\n",
" source_folders = srcdir\n",
" destination_folder = dstdir\n",
" copy_files_from_folders(name, source_folders, destination_folder)\n",
" '''\n",
" \n",
" destination_folder = os.path.join(destination_folder, name)\n",
" if not os.path.exists(destination_folder):\n",
" os.makedirs(destination_folder)\n",
" \n",
" total_files = 0\n",
" duplicate_files = []\n",
" \n",
" for source_folder in source_folders:\n",
" print(source_folder) \n",
" for file_name in tqdm(os.listdir(source_folder)):\n",
" \n",
" if 'story' in source_folder:\n",
" if 'highligits' in source_folder:\n",
" destination_file_name = ''.join(file_name.split('.')[:-1] + ['highligits']) + '.' + file_name.split('.')[-1] \n",
" else:\n",
" destination_file_name = ''.join(file_name.split('.')[:-1] + ['story']) + '.' + file_name.split('.')[-1]\n",
" else:\n",
" destination_file_name = ''.join(file_name.split('.')[:-1] + ['post']) + '.' + file_name.split('.')[-1]\n",
" \n",
" \n",
" source_file_path = os.path.join(source_folder, file_name)\n",
" destination_file_path = os.path.join(destination_folder, destination_file_name)\n",
" if os.path.isfile(source_file_path):\n",
" if os.path.isfile(destination_file_path):\n",
" duplicate_files.append(destination_file_path)\n",
" else:\n",
" shutil.copy(source_file_path, destination_file_path)\n",
" total_files += 1\n",
"\n",
" print(f'Total {total_files} files copies')\n",
" for i in duplicate_files:\n",
" print(i)\n",
" \n",
"\n",
"\n",
"name = 'Folder1'\n",
"source_folders = [os.path.join(IMAGEDIR, i) for i in os.listdir(IMAGEDIR)]\n",
"destination_folder = r'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/'\n",
"copy_files_from_folders(name, source_folders, destination_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import subprocess\n",
"import os\n",
"from joblib import Parallel, delayed\n",
"\n",
"def merge_single_folder(src, target_folder, ignore_existing=False, dry_run=False):\n",
" \"\"\"\n",
" Merges contents of a single source folder into the target folder using rsync.\n",
" \"\"\"\n",
" if not os.path.exists(src):\n",
" print(f\"Source folder does not exist: {src}\")\n",
" return\n",
"\n",
" cmd = [\"rsync\", \"-ah\"]\n",
" \n",
" if ignore_existing:\n",
" cmd.append(\"--ignore-existing\")\n",
" if dry_run:\n",
" cmd.append(\"--dry-run\")\n",
"\n",
" cmd += [f\"{src.rstrip('/')}/\", target_folder]\n",
"\n",
" print(f\"Merging '{src}' into '{target_folder}'\")\n",
" subprocess.run(cmd, check=True)\n",
" print(f\"{len(os.listdir(target_folder))} files currently in target\")\n",
" print(\"-\" * 100)\n",
"\n",
"def merge_folders_with_rsync_parallel(source_folders, target_folder, ignore_existing=False, dry_run=False, n_jobs=-1):\n",
" \"\"\"\n",
" Parallel merge of multiple source folders into a target folder using rsync and joblib.\n",
" \"\"\"\n",
" if not os.path.exists(target_folder):\n",
" os.makedirs(target_folder)\n",
"\n",
" Parallel(n_jobs=n_jobs, backend=\"loky\")(\n",
" delayed(merge_single_folder)(src, target_folder, ignore_existing, dry_run)\n",
" for src in source_folders\n",
" )\n",
"\n",
"\n",
"IMAGEDIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/From-Images-Config/\"\n",
"source_dirs = [os.path.join(IMAGEDIR, i) for i in os.listdir(IMAGEDIR)]\n",
"target_dir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3'\n",
"\n",
"merge_folders_with_rsync_parallel(source_dirs, target_dir, ignore_existing=False, dry_run=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"# import pandas as pd\n",
"import fireducks.pandas as pd\n",
"from tqdm import tqdm, trange\n",
"\n",
"HINDI_AUDIO_DIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi\"\n",
"IMAGEDIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/\"\n",
"IMAGEAUDIOCSV = r\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi3.csv\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" audio_path | \n",
" referenceImage | \n",
" gender | \n",
" state | \n",
" district | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Zoya76543_7... | \n",
" Images/IISc_VaaniProject_GENERIC_0473.jpg | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 1 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Kaja46663_4... | \n",
" Images/IISc_VaaniProject_GENERIC_1011.jpg | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 2 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Kris66646_6... | \n",
" Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015... | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 3 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Abuz00012_0... | \n",
" Images/IISc_VaaniProject_GENERIC_0418.jpg | \n",
" Male | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 4 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Adah26256_2... | \n",
" Images/IISc_VaaniProject_GENERIC_0851.jpg | \n",
" Male | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 38295 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_00594... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38296 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_00700... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38297 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_01941... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38298 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_01405... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38299 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_01327... | \n",
" Female | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
"
\n",
"
38300 rows x 5 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"ImageAudioHindi_df = pd.read_csv(IMAGEAUDIOCSV)\n",
"ImageAudioHindi_df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def walkDIR(folder_path, include=None):\n",
" file_list = []\n",
" for root, _, files in os.walk(folder_path):\n",
" for file in files:\n",
" if include is None or any(file.endswith(ext) for ext in include):\n",
" file_list.append(os.path.join(root, file))\n",
" print(\"Files found:\", len(file_list))\n",
" return file_list"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Files found: 284593\n",
"Files found: 73755\n"
]
},
{
"data": {
"text/plain": [
"(284593, 73755)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"images_files = walkDIR(IMAGEDIR, include=['.jpg', '.png'])\n",
"audio_files = walkDIR(HINDI_AUDIO_DIR, include=['.wav'])\n",
"\n",
"len(images_files), len(audio_files)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/IISc_VaaniProject_Churu-SPECIFIC_00422.jpg',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/IISc_VaaniProject_LowerDibangvalley-SPECIFIC_01474.jpg',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/IISc_VaaniProject_Khordha-SPECIFIC_02034.jpg']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"images_files[:3]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Ishi43015_43015203103000098276_NewDelhi-SPECIFIC_00021_9344_14805.wav',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Radi56078_56078170942000098763_GENERIC_1166_182_3703.wav']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"audio_files[:3]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.path.dirname(IMAGEDIR)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images'"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.path.dirname(os.path.dirname(IMAGEDIR))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/',\n",
" '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi')"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"IMAGEDIR, HINDI_AUDIO_DIR"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|\u001b[33m████████████████████████████████████████████████████████\u001b[0m| 38300/38300 [00:46<00:00, 829.07it/s]\u001b[0m\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" image_path | \n",
" audio_path | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 1 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 2 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 3 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 4 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 38295 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 38296 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 38297 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 38298 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
" 38299 | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
" /scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan... | \n",
"
\n",
" \n",
"
\n",
"
38300 rows × 2 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"mapping_counter = 0\n",
"available_img_audios = {\"image_path\":[], \"audio_path\":[]}\n",
"\n",
"for i, row in tqdm(ImageAudioHindi_df.iterrows(), ncols=100, total=ImageAudioHindi_df.shape[0], colour='YELLOW'):\n",
" image_path = os.path.join(os.path.dirname(IMAGEDIR), os.path.basename(row.referenceImage))\n",
" audio_path = os.path.join(HINDI_AUDIO_DIR, f\"{row.state}_{row.district}\", row.audio_path)\n",
" \n",
" if all([os.path.isfile(audio_path), os.path.isfile(image_path)]):\n",
" available_img_audios['image_path'].append(image_path)\n",
" available_img_audios['audio_path'].append(audio_path)\n",
"\n",
"available_img_audios_df = pd.DataFrame(available_img_audios)\n",
"available_img_audios_df"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%|\u001b[33m \u001b[0m| 0/38300 [00:00, ?it/s]\u001b[0m"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|\u001b[33m███████████████████████████████████████████████████████\u001b[0m| 38300/38300 [00:28<00:00, 1362.80it/s]\u001b[0m\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" image_path | \n",
" audio_path | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# mapping_counter = 0\n",
"# available_img_audios = {\"image_path\":[], \"audio_path\":[]}\n",
"\n",
"# for i, row in tqdm(ImageAudioHindi_df.iterrows(), ncols=100, total=ImageAudioHindi_df.shape[0], colour='YELLOW'):\n",
"# image_path = os.path.join(os.path.dirname(IMAGEDIR), row.referenceImage)\n",
"# audio_path = os.path.join(HINDI_AUDIO_DIR, f\"{row.state}_{row.district}\", row.audio_path)\n",
" \n",
"# if all([os.path.isfile(audio_path), os.path.isfile(image_path)]):\n",
"# available_img_audios['image_path'].append(image_path)\n",
"# available_img_audios['audio_path'].append(audio_path)\n",
"\n",
"# available_img_audios_df = pd.DataFrame(available_img_audios)\n",
"# available_img_audios_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# available_img_audios_df.to_csv(\"available_img_audios.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((26810, 2), (11490, 2))"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# from sklearn.model_selection import train_test_split\n",
"\n",
"# train_df, test_df = train_test_split(available_img_audios_df, test_size=0.3, shuffle=True, random_state=42)\n",
"# train_df = train_df.reset_index(drop=True)\n",
"# test_df = test_df.reset_index(drop=True)\n",
"# train_df.shape, test_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# train_df.to_csv(\"available_img_audios_TRAIN2.csv\", index=False)\n",
"# test_df.to_csv(\"available_img_audios_TEST2.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Audio Image Mapping for All Images Polars MetaData"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"# import pandas as pd\n",
"import fireducks.pandas as pd\n",
"import polars as pl\n",
"from tqdm import tqdm, trange\n",
"\n",
"HINDI_AUDIO_DIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi\"\n",
"IMAGEDIR = r\"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/\"\n",
"IMAGEAUDIOCSV = r\"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi3.csv\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" audio_path | \n",
" referenceImage | \n",
" gender | \n",
" state | \n",
" district | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Zoya76543_7... | \n",
" Images/IISc_VaaniProject_GENERIC_0473.jpg | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 1 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Kaja46663_4... | \n",
" Images/IISc_VaaniProject_GENERIC_1011.jpg | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 2 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Kris66646_6... | \n",
" Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015... | \n",
" Female | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 3 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Abuz00012_0... | \n",
" Images/IISc_VaaniProject_GENERIC_0418.jpg | \n",
" Male | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" 4 | \n",
" IISc_VaaniProject_M_Delhi_NewDelhi_Adah26256_2... | \n",
" Images/IISc_VaaniProject_GENERIC_0851.jpg | \n",
" Male | \n",
" Delhi | \n",
" NewDelhi | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 38295 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_00594... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38296 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_00700... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38297 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_01941... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38298 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_01405... | \n",
" Male | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
" 38299 | \n",
" IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844... | \n",
" Images/IISc_VaaniProject_Ranchi-SPECIFIC_01327... | \n",
" Female | \n",
" Jharkhand | \n",
" Ranchi | \n",
"
\n",
" \n",
"
\n",
"
38300 rows x 5 columns
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"ImageAudioHindi_df = pd.read_csv(IMAGEAUDIOCSV)\n",
"ImageAudioHindi_df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}