In [1]:
import os
import json
from pyspark.sql import SparkSession
import pandas as pd
import polars as pl
from tqdm.auto import tqdm, trange
from concurrent.futures import ThreadPoolExecutor, as_completed

# Dataset roots: one copy on scratch storage, one under the home directory.
SCRATCH = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani"
DATADIR = "/home/IITB/ai-at-ieor/23m1521/datasets/Vaani"

# Full metadata dump (read line by line further down) and the parquet output folder.
JSON_PATH = os.path.join(DATADIR, "Vaani_IIsc_Artpark_Full_Data.json")
# IMAGES_PATH = os.path.join(SCRATCH, "Images")
IMAGES_PARQUETS = os.path.join(SCRATCH, "images_parquets")

# File names for URL lists (presumably relative to the working directory)
# and the base URL of the Vaani site.
AUDIO_URLS = "audio_urls.txt"
IMAGES_URLS = "images_urls.txt"
IMAGE_ROOT_URL = "https://vaani.iisc.ac.in/"

In [2]:
# Collect one flat metadata row per audio record from the JSON dump.
# The file is a single JSON array written with one object per line, so every
# line is decoded on its own once the surrounding array punctuation is removed.
N_RECORDS = 9584932  # expected number of records; only sizes the progress bar

data_dict = {'image_name': [], 'state': [], 'district': [], 'gender': [], 'audio_language': [], 'audio_name': []}

with open(JSON_PATH, 'r') as json_file:
    for raw_line in tqdm(json_file, total=N_RECORDS):
        # Strip the array's opening "[" and any trailing "," / "]" wherever they
        # occur, instead of assuming they sit on the first / every line.
        payload = raw_line.strip().lstrip('[').rstrip(',]')
        if not payload:
            # Line held only array punctuation (e.g. a bare "[" or "]").
            continue

        metadata = json.loads(payload)['metadata']

        # Use the first listed language; fall back to None when the list is
        # empty rather than aborting the whole pass with an IndexError.
        languages = metadata['languagesSpoken']
        audio_language = languages[0] if languages else None

        data_dict['image_name'].append(metadata['imageFileName'])
        data_dict['state'].append(metadata['state'])
        data_dict['district'].append(metadata['district'])
        data_dict['gender'].append(metadata['gender'])
        data_dict['audio_language'].append(audio_language)
        data_dict['audio_name'].append(metadata['audioFileName'])

  0%|          | 0/9584932 [00:00<?, ?it/s]

In [3]:
# Turn the collected column lists into a DataFrame and show it as the cell output.
df = pd.DataFrame(data=data_dict)
df

Unnamed: 0,image_name,state,district,gender,audio_language,audio_name
0,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
1,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
2,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
3,Images/IISc_VaaniProject_GENERIC_0073.jpg,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
4,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
...,...,...,...,...,...,...
9584927,Images/IISc_VaaniProject_GENERIC_0554.jpg,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584928,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584929,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584930,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...


In [5]:
# Shape of the array of distinct image names referenced by the records.
df["image_name"].unique().shape

(128807,)

///////////////////////////////////////////////////////////////////////

In [None]:
# Same pass over the JSON dump as before, but keep only the records whose
# image file is actually present under SCRATCH.
N_RECORDS = 9584932  # expected number of records; only sizes the progress bar

data_dict = {'image_name': [], 'state': [], 'district': [], 'gender': [], 'audio_language': [], 'audio_name': []}

# image_name -> whether the file exists. Many records point at the same image,
# so caching the answer avoids one filesystem lookup per record.
image_exists = {}

with open(JSON_PATH, 'r') as json_file:
    for raw_line in tqdm(json_file, total=N_RECORDS):
        # Strip the array's opening "[" and any trailing "," / "]" wherever they
        # occur, instead of assuming they sit on the first / every line.
        payload = raw_line.strip().lstrip('[').rstrip(',]')
        if not payload:
            # Line held only array punctuation (e.g. a bare "[" or "]").
            continue

        metadata = json.loads(payload)['metadata']

        image_name = metadata['imageFileName']
        if image_name not in image_exists:
            image_exists[image_name] = os.path.isfile(os.path.join(SCRATCH, image_name))
        if not image_exists[image_name]:
            continue

        # Use the first listed language; fall back to None when the list is
        # empty rather than aborting the whole pass with an IndexError.
        languages = metadata['languagesSpoken']
        audio_language = languages[0] if languages else None

        data_dict['image_name'].append(image_name)
        data_dict['state'].append(metadata['state'])
        data_dict['district'].append(metadata['district'])
        data_dict['gender'].append(metadata['gender'])
        data_dict['audio_language'].append(audio_language)
        data_dict['audio_name'].append(metadata['audioFileName'])

  0%|          | 0/9584932 [00:00<?, ?it/s]

In [3]:
# Turn the collected column lists into a DataFrame and show it as the cell output.
df = pd.DataFrame(data=data_dict)
df

Unnamed: 0,image_name,state,district,gender,audio_language,audio_name
0,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
1,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
2,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
3,Images/IISc_VaaniProject_GENERIC_0073.jpg,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
4,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,Maharashtra,Aurangabad,female,Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...
...,...,...,...,...,...,...
9584927,Images/IISc_VaaniProject_GENERIC_0554.jpg,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584928,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584929,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...
9584930,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,Karnataka,Chamarajanagar,female,Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...


In [None]:
# df.to_parquet('Vaani-Images-Audio-MetaData.parquet', index=False)

<hr>
<hr>

# Final per-image ("imageBY") table built from the full metadata — pending

In [2]:
import os
import json
# import pandas as pd
import fireducks.pandas as pd
from tqdm import tqdm, trange
import matplotlib.pyplot as plt

# All dataset artifacts hang off one scratch root; the flattened metadata
# parquet lives in the project folder under home.
_VAANI_ROOT = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani"

DATADIR = _VAANI_ROOT + "/Audios/English"
IMAGEDIR = _VAANI_ROOT + "/Images"
FINAL_META = "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/finalMETA.parquet"

In [3]:
# Load the per-recording metadata table from the parquet file at FINAL_META
# and display it as the cell output.
finalMETA = pd.read_parquet(FINAL_META)
finalMETA

Unnamed: 0,id,file_name,file_url,assertLanguage,languagesSpoken,state,district,gender,audioFileName,imageFileName,pincode,speakerImageHash
0,2,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,Marathi,[Marathi],Maharashtra,Aurangabad,female,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,431105,(iq~v-nq +lTC]QXDCSnJ2~23=+|Nq~nn(
1,3,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,Marathi,[Marathi],Maharashtra,Aurangabad,female,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,431105,(iq~v-qq +lTC]QXDCSnJ2~23=+|Nq~miz
2,4,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,Marathi,[Marathi],Maharashtra,Aurangabad,female,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,431105,(iq~v.pl +lTC]QXDCSnJ2~23=+|Nq~lo{
3,5,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,Marathi,[Marathi],Maharashtra,Aurangabad,female,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,Images/IISc_VaaniProject_GENERIC_0073.jpg,431105,(iq~v+kl 1<0~A3:Aivx*
4,6,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,Marathi,[Marathi],Maharashtra,Aurangabad,female,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,Images/IISc_VaaniProject_Aurangabad-SPECIFIC_0...,431105,(iq~w+rk +lTC]QXDCSnJ2~23=+|Nq(knx
...,...,...,...,...,...,...,...,...,...,...,...,...
9584927,9696433,IISc_VaaniProject_M_KA_Chamrajn_42017276_16081...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,Kannada,"[Kannada, English]",Karnataka,Chamarajanagar,female,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,Images/IISc_VaaniProject_GENERIC_0554.jpg,571440,"(oi~r)iivq )~=/I+|Nq,nm"
9584928,9696434,IISc_VaaniProject_M_KA_Chamrajn_42017276_16053...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,Kannada,"[Kannada, English]",Karnataka,Chamarajanagar,female,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,571440,(oi{t(iivq |JPWiCL]K^CTs=G~|80@|Avq)mp
9584929,9696435,IISc_VaaniProject_M_KA_Chamrajn_42017276_12370...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,Kannada,"[Kannada, English]",Karnataka,Chamarajanagar,female,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,571440,(kl}q-iivq |JPWiCL]K^CTs=G~|80@|Avr(kr
9584930,9696436,IISc_VaaniProject_M_KA_Chamrajn_42017276_09272...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,Kannada,"[Kannada, English]",Karnataka,Chamarajanagar,female,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,Images/IISc_VaaniProject_Chamrajnagar-SPECIFIC...,571440,~rk}s+iivq |JPWiCL]K^CTs=G~|80@|Avq*km


In [3]:
# Distinct state names present in the metadata, in order of first appearance.
finalMETA["state"].unique()

array(['Maharashtra', 'Bihar', 'Chhattisgarh', 'Telangana', 'Jharkhand',
       'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'WestBengal',
       'Rajasthan', 'Uttarakhand', 'Goa'], dtype=object)

In [37]:
# Alphabetical list of the distinct district names in the metadata.
sorted(finalMETA["district"].unique())

['Anantpur',
 'Araria',
 'Aurangabad',
 'Balrampur',
 'Bastar',
 'Begusarai',
 'Belgaum',
 'Bellary',
 'Bhagalpur',
 'Bijapur',
 'Bilaspur',
 'Budaun',
 'Chamarajanagar',
 'Chandrapur',
 'Chittoor',
 'Churu',
 'DakshinDinajpur',
 'DakshinaKannada',
 'Darbhanga',
 'Deoria',
 'Dharwad',
 'Dhule',
 'EastChamparan',
 'Etah',
 'Gaya',
 'Ghazipur',
 'Gopalganj',
 'Gorakhpur',
 'Gulbarga',
 'Guntur',
 'Hamirpur',
 'Jahanabad',
 'Jalaun',
 'Jalpaiguri',
 'Jamtara',
 'Jamui',
 'Jashpur',
 'Jhargram',
 'JyotibaPhuleNagar',
 'Kabirdham',
 'Karimnagar',
 'Kishanganj',
 'Kolkata',
 'Korba',
 'Krishna',
 'Lakhisarai',
 'Madhepura',
 'Malda',
 'Muzaffarnagar',
 'Muzaffarpur',
 'Mysore',
 'Nagaur',
 'Nagpur',
 'Nalgonda',
 'North24Parganas',
 'NorthSouthGoa',
 'PaschimMedinipur',
 'Pune',
 'Purnia',
 'Purulia',
 'Raichur',
 'Raigarh',
 'Rajnandgaon',
 'Saharsa',
 'Sahebganj',
 'Samastipur',
 'Saran',
 'Sarguja',
 'Shimoga',
 'Sindhudurg',
 'Sitamarhi',
 'Solapur',
 'Srikakulam',
 'Sukma',
 'Supaul',
 

In [3]:
# Number of directory entries found directly inside the image folder.
image_entries = os.listdir(IMAGEDIR)
len(image_entries)

128807

In [4]:
# Shape of the array of distinct image file names referenced by the metadata.
finalMETA["imageFileName"].unique().shape

(128807,)

In [5]:
# Shape of the array of distinct asserted languages in the metadata.
finalMETA["assertLanguage"].unique().shape

(54,)

In [52]:
import csv
from tqdm import tqdm, trange


# Write one CSV row per image: how many recordings reference it, a running
# total, where the recordings come from, per-language counts, per-language
# recording URLs, and one 0/1 presence flag per known language.
language_columns = sorted(finalMETA.assertLanguage.dropna().unique())
header = [
    "id", "imageFileName", "audioCounts", "total",
    "state", "district", "assertLanguage", "audio_urls",
] + language_columns

with open('imageBY3.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    total = 0             # recordings seen so far across all images
    write_rows_list = []  # rows buffered until the next periodic write

    pbar = tqdm(
        finalMETA.groupby('imageFileName'),
        total=finalMETA['imageFileName'].nunique(),
        colour="green",
        ncols=70,
    )

    for i, (image, data) in enumerate(pbar):
        languages = data['assertLanguage']

        # Recording URLs grouped by language; languages keep the order in which
        # they first appear within this image's recordings.
        assert_Languages = languages.unique()
        assert_Languages_url_dict = data.set_index('file_url')['assertLanguage'].to_dict()
        audio_urls_dict = {}
        for lang in assert_Languages:
            audio_urls_dict[lang] = [
                url for url, url_lang in assert_Languages_url_dict.items() if url_lang == lang
            ]

        # An image can be shared by several states / districts, so keep lists.
        state = data['state'].unique().tolist()
        district = data['district'].unique().tolist()

        audioCounts = languages.count()
        total += audioCounts
        lang_counts = languages.value_counts().to_dict()

        # 1 when the image has at least one recording in that language, else 0.
        lang_row = [1 if lang in lang_counts else 0 for lang in language_columns]

        row = [i + 1, image, audioCounts, total, state, district, lang_counts, audio_urls_dict]
        write_rows_list.append(row + lang_row)

        # Push the buffered rows to disk every 1000 images.
        if i > 0 and i % 1000 == 0:
            writer.writerows(write_rows_list)
            file.flush()
            write_rows_list = []

    # Rows collected since the last periodic write.
    if write_rows_list:
        writer.writerows(write_rows_list)

100%|[32m█████████████████████████[0m| 128807/128807 [29:41<00:00, 72.31it/s][0m


In [4]:
# Load the per-image summary CSV and display it.
IMAGE_BY_CSV = "/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/imageBY3.csv"
imageBY = pd.read_csv(IMAGE_BY_CSV)

imageBY

Unnamed: 0,id,imageFileName,audioCounts,total,state,district,assertLanguage,audio_urls,Agariya,Angika,...,Sadri,Santali,Shekhawati,Surgujia,Surjapuri,Tamil,Telugu,Tulu,Urdu,Wagdi
0,1,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,52,52,['AndhraPradesh'],['Anantpur'],"{'Telugu': 45, 'Hindi': 7}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,41,93,['AndhraPradesh'],['Anantpur'],"{'Telugu': 35, 'Hindi': 6}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,53,146,['AndhraPradesh'],['Anantpur'],"{'Telugu': 45, 'Hindi': 6, 'Bengali': 2}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...,0,0,...,0,0,0,0,0,0,1,0,0,0
3,4,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,38,184,['AndhraPradesh'],['Anantpur'],"{'Telugu': 32, 'Hindi': 5, 'Urdu': 1}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...,0,0,...,0,0,0,0,0,0,1,0,1,0
4,5,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,57,241,['AndhraPradesh'],['Anantpur'],"{'Telugu': 48, 'Hindi': 7, 'Urdu': 2}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128802,128803,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,35,9584829,['AndhraPradesh'],['Vishakapattanam'],{'Telugu': 35},{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...,0,0,...,0,0,0,0,0,0,1,0,0,0
128803,128804,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,19,9584848,['AndhraPradesh'],['Vishakapattanam'],"{'Telugu': 16, 'Hindi': 3}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...,0,0,...,0,0,0,0,0,0,1,0,0,0
128804,128805,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,17,9584865,['AndhraPradesh'],['Vishakapattanam'],"{'Telugu': 15, 'Hindi': 2}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...,0,0,...,0,0,0,0,0,0,1,0,0,0
128805,128806,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,31,9584896,['AndhraPradesh'],['Vishakapattanam'],{'Telugu': 31},{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...,0,0,...,0,0,0,0,0,0,1,0,0,0


In [6]:
# Distinct values of the state column (each value is a stringified list).
imageBY["state"].unique()

array(["['AndhraPradesh']", "['Bihar']", "['Maharashtra']",
       "['Chhattisgarh']", "['Karnataka']", "['UttarPradesh']",
       "['Rajasthan']", "['WestBengal']",
       "['Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Rajasthan', 'Bihar', 'WestBengal', 'Uttarakhand', 'Goa', 'Jharkhand']",
       "['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'WestBengal', 'Rajasthan', 'Jharkhand', 'Goa']",
       "['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Maharashtra', 'Goa', 'Rajasthan', 'WestBengal']",
       "['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'Rajasthan', 'WestBengal', 'Goa']",
       "['Bihar', 'Chhattisgarh', 'Telangana', 'AndhraPradesh', 'Maharashtra', 'Karnataka', 'UttarPradesh', 'Uttarakhand', 'Jharkhand', 'WestBenga

In [None]:
# state_len = set()
# district_len = set()
# for state, district in zip(imageBY.state.values, imageBY.district.values):
#     state = eval(state)
#     district = eval(district)
#     state_len.add(len(state))
#     district_len.add(len(district))
# state_len, district_len

({1}, {1})

In [None]:
# imageBY.state = imageBY.state.apply(lambda x: eval(x)[0])
# imageBY.district = imageBY.district.apply(lambda x: eval(x)[0])
# imageBY.to_csv("/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/imageBY3.csv", index=False)

In [7]:
# Images whose Hindi presence flag is set.
imageBY[imageBY["Hindi"] == 1]

Unnamed: 0,id,imageFileName,audioCounts,total,state,district,assertLanguage,audio_urls,Agariya,Angika,...,Sadri,Santali,Shekhawati,Surgujia,Surjapuri,Tamil,Telugu,Tulu,Urdu,Wagdi
0,1,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,52,52,['AndhraPradesh'],['Anantpur'],"{'Telugu': 45, 'Hindi': 7}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,41,93,['AndhraPradesh'],['Anantpur'],"{'Telugu': 35, 'Hindi': 6}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,53,146,['AndhraPradesh'],['Anantpur'],"{'Telugu': 45, 'Hindi': 6, 'Bengali': 2}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...,0,0,...,0,0,0,0,0,0,1,0,0,0
3,4,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,38,184,['AndhraPradesh'],['Anantpur'],"{'Telugu': 32, 'Hindi': 5, 'Urdu': 1}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...,0,0,...,0,0,0,0,0,0,1,0,1,0
4,5,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,57,241,['AndhraPradesh'],['Anantpur'],"{'Telugu': 48, 'Hindi': 7, 'Urdu': 2}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128798,128799,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,32,9584687,['AndhraPradesh'],['Vishakapattanam'],"{'Telugu': 29, 'Hindi': 3}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...,0,0,...,0,0,0,0,0,0,1,0,0,0
128799,128800,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,35,9584722,['AndhraPradesh'],['Vishakapattanam'],"{'Telugu': 33, 'Hindi': 2}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...,0,0,...,0,0,0,0,0,0,1,0,0,0
128803,128804,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,19,9584848,['AndhraPradesh'],['Vishakapattanam'],"{'Telugu': 16, 'Hindi': 3}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...,0,0,...,0,0,0,0,0,0,1,0,0,0
128804,128805,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,17,9584865,['AndhraPradesh'],['Vishakapattanam'],"{'Telugu': 15, 'Hindi': 2}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...,0,0,...,0,0,0,0,0,0,1,0,0,0


In [14]:
# Express three image counts as percentages of the 128,807 images.
# The counts are hard-coded; presumably they are the row counts of the
# corresponding filtered views of the per-image table — verify before reuse.
N_IMAGES = 128807


def _percent_of_images(count):
    """Return `count` as a percentage of N_IMAGES."""
    return count * 100 / N_IMAGES


Hindi = _percent_of_images(105940)
English = _percent_of_images(468)
HindiNotEnglish = _percent_of_images(105609)
Hindi, English, HindiNotEnglish

(82.24708284487645, 0.3633342908382308, 81.99010923319385)

In [8]:
# Images that have Hindi recordings but no English ones, re-indexed from 0.
no_english = imageBY["English"] != 1
has_hindi = imageBY["Hindi"] == 1
HindiNotEnglish_df = imageBY.loc[no_english & has_hindi].reset_index(drop=True)
HindiNotEnglish_df

Unnamed: 0,id,imageFileName,audioCounts,total,state,district,assertLanguage,audio_urls,Agariya,Angika,...,Sadri,Santali,Shekhawati,Surgujia,Surjapuri,Tamil,Telugu,Tulu,Urdu,Wagdi
0,1,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,52,52,['AndhraPradesh'],['Anantpur'],"{'Telugu': 45, 'Hindi': 7}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,41,93,['AndhraPradesh'],['Anantpur'],"{'Telugu': 35, 'Hindi': 6}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,53,146,['AndhraPradesh'],['Anantpur'],"{'Telugu': 45, 'Hindi': 6, 'Bengali': 2}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/A...,0,0,...,0,0,0,0,0,0,1,0,0,0
3,4,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,38,184,['AndhraPradesh'],['Anantpur'],"{'Telugu': 32, 'Hindi': 5, 'Urdu': 1}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...,0,0,...,0,0,0,0,0,0,1,0,1,0
4,5,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,57,241,['AndhraPradesh'],['Anantpur'],"{'Telugu': 48, 'Hindi': 7, 'Urdu': 2}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/An...,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105604,128799,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,32,9584687,['AndhraPradesh'],['Vishakapattanam'],"{'Telugu': 29, 'Hindi': 3}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...,0,0,...,0,0,0,0,0,0,1,0,0,0
105605,128800,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,35,9584722,['AndhraPradesh'],['Vishakapattanam'],"{'Telugu': 33, 'Hindi': 2}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...,0,0,...,0,0,0,0,0,0,1,0,0,0
105606,128804,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,19,9584848,['AndhraPradesh'],['Vishakapattanam'],"{'Telugu': 16, 'Hindi': 3}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/V...,0,0,...,0,0,0,0,0,0,1,0,0,0
105607,128805,Images/IISc_VaaniProject_Vishakapattanam-SPECI...,17,9584865,['AndhraPradesh'],['Vishakapattanam'],"{'Telugu': 15, 'Hindi': 2}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/Vi...,0,0,...,0,0,0,0,0,0,1,0,0,0


In [None]:
import ast

# "State_District" identifiers of the regions of interest; only the district
# part is needed for matching against the per-image district lists.
Hindi_state_district = [
    'Delhi_NewDelhi', 'MadhyaPradesh_Bhopal', 'MadhyaPradesh_Katni',
    'Chhattisgarh_Bilaspur', 'Maharashtra_Nagpur', 'UttarPradesh_Varanasi',
    'UttarPradesh_Lucknow', 'UttarPradesh_Gorakhpur'
]
Hindi_district = [name.split("_")[-1] for name in Hindi_state_district]
print(f"['{Hindi_district[1]}']")

# Debug cap on the number of rows inspected. The original `continue` skipped
# only row 1000 and still printed every other row of the frame.
MAX_ROWS = 1000

for i, row in tqdm(HindiNotEnglish_df.iterrows(), colour='blue', total=HindiNotEnglish_df.shape[0], ncols=70):
    if i == MAX_ROWS:
        break
    # The district column holds a stringified Python list (e.g. "['Anantpur']").
    # literal_eval parses it without executing arbitrary code from the CSV.
    row_districts = ast.literal_eval(row['district'])
    print(Hindi_district[0], row_districts, Hindi_district[0] in row_districts)
    # if Hindi_district[0] in row_districts:
    #     print(row)
    #     break
    

In [39]:
# Location of the full metadata JSON on scratch storage.
JSON_PATH = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Vaani_IIsc_Artpark_Full_Data.json"

# Parse the whole file in one call and display the resulting frame; the
# per-record details stay nested in the `metadata` column.
jsondf = pd.read_json(JSON_PATH)
jsondf

Unnamed: 0,id,file_name,file_url,metadata
0,2,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '..."
1,3,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '..."
2,4,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '..."
3,5,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '..."
4,6,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '..."
...,...,...,...,...
9584927,9696433,IISc_VaaniProject_M_KA_Chamrajn_42017276_16081...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi..."
9584928,9696434,IISc_VaaniProject_M_KA_Chamrajn_42017276_16053...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi..."
9584929,9696435,IISc_VaaniProject_M_KA_Chamrajn_42017276_12370...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi..."
9584930,9696436,IISc_VaaniProject_M_KA_Chamrajn_42017276_09272...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi..."


In [None]:
# Metadata fields to promote to top-level columns, in output order.
metadata_cols = [
    'assertLanguage', 'languagesSpoken', 'state', 'district', 'gender',
    'audioFileName', 'imageFileName', 'pincode', 'speakerImageHash',
]

# Expand every record's metadata dict into columns and keep the selected ones.
meta_df = pd.json_normalize(jsondf['metadata'])[metadata_cols]

# Replace the nested column with the flattened ones; concat aligns the two
# frames row by row on their index.
top_level_df = jsondf.drop(columns=['metadata'])
finalMETA = pd.concat([top_level_df, meta_df], axis=1)
finalMETA

In [1]:
import os
# List what is stored directly under the Audios directory.
AUDIO_ROOT = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/"
os.listdir(AUDIO_ROOT)

['Parquet',
 'English',
 'train-00000-of-00057.parquet',
 'NewDelhi_train-00000-of-00054.parquet']

In [3]:
# Inspect one NewDelhi parquet shard (audio payload plus per-recording metadata).
NEW_DELHI_SHARD = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/NewDelhi_train-00000-of-00054.parquet"
parquet_df = pd.read_parquet(NEW_DELHI_SHARD)
parquet_df

Unnamed: 0,audio,language,languagesKnown,gender,state,district,pincode,stay(years),isTranscriptionAvailable,transcript,referenceImage
0,{'bytes': b'RIFF\xd2\x04\x01\x00WAVEfmt \x10\x...,Hindi,['Hindi'],Female,Delhi,NewDelhi,110004,NewDelhi(20),No,,Images/IISc_VaaniProject_GENERIC_0473.jpg
1,{'bytes': b'RIFF\xb0h\x01\x00WAVEfmt \x10\x00\...,Hindi,['Hindi'],Female,Delhi,NewDelhi,110001,NewDelhi(19),No,,Images/IISc_VaaniProject_GENERIC_1011.jpg
2,{'bytes': b'RIFF\x84+\x01\x00WAVEfmt \x10\x00\...,Hindi,['Hindi'],Female,Delhi,NewDelhi,110067,NewDelhi(11),No,,Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015...
3,{'bytes': b'RIFF2\xd3\x01\x00WAVEfmt \x10\x00\...,Hindi,['Hindi'],Male,Delhi,NewDelhi,110001,NewDelhi(24),Yes,ऐच_डी_ऐफ_सी बैंक {H_D_F_C bank} का और उसमे एक ...,Images/IISc_VaaniProject_GENERIC_0418.jpg
4,{'bytes': b'RIFF\xe45\x01\x00WAVEfmt \x10\x00\...,Hindi,['Hindi'],Male,Delhi,NewDelhi,110023,NewDelhi(20),No,,Images/IISc_VaaniProject_GENERIC_0851.jpg
...,...,...,...,...,...,...,...,...,...,...,...
3035,{'bytes': b'RIFF\xbch\x02\x00WAVEfmt \x10\x00\...,Hindi,['Hindi'],Female,Delhi,NewDelhi,110038,NewDelhi(20),No,,Images/IISc_VaaniProject_GENERIC_0193.jpg
3036,{'bytes': b'RIFFJD\x01\x00WAVEfmt \x10\x00\x00...,Hindi,['Hindi'],Female,Delhi,NewDelhi,110028,NewDelhi(20),No,,Images/IISc_VaaniProject_GENERIC_0249.jpg
3037,{'bytes': b'RIFF\x12\x02\x01\x00WAVEfmt \x10\x...,Hindi,['Hindi'],Female,Delhi,NewDelhi,110023,NewDelhi(20),No,,Images/IISc_VaaniProject_GENERIC_1268.jpg
3038,{'bytes': b'RIFFz\xbc\x01\x00WAVEfmt \x10\x00\...,Hindi,['Hindi'],Female,Delhi,NewDelhi,110011,NewDelhi(23),No,,Images/IISc_VaaniProject_NewDelhi-SPECIFIC_016...


In [9]:
# Image file name of the first row in the Hindi-without-English subset.
HindiNotEnglish_df["imageFileName"][0]

'Images/IISc_VaaniProject_Anantpur-SPECIFIC_00001.jpg'

In [10]:
# All recordings in the full metadata that reference that first image.
first_image = HindiNotEnglish_df["imageFileName"][0]
finalMETA[finalMETA["imageFileName"] == first_image]

Unnamed: 0,id,file_name,file_url,assertLanguage,languagesSpoken,state,district,gender,audioFileName,imageFileName,pincode,speakerImageHash
135079,135081,IISc_VaaniProject_S_AP_Anantpur_113390_1167518...,https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...,Telugu,"[Telugu, Hindi]",AndhraPradesh,Anantpur,female,Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,515556,(jo}v(qk +eCPcZlTfB:<|+53:Aivq~j
135088,135090,IISc_VaaniProject_S_AP_Anantpur_113390_1167518...,https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...,Telugu,"[Telugu, Hindi]",AndhraPradesh,Anantpur,female,Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,515556,(jo}v(qk +eCPcZlTfB:<|+53:Aivq~j
408373,408532,IISc_VaaniProject_S_AP_Anantpur_94940_10873932...,https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...,Hindi,"[Hindi, Urdu]",AndhraPradesh,Anantpur,female,Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,515763,(iq}t0lk +eCPcZlTfB:<|+53:Aivq~j
792425,792662,IISc_VaaniProject_S_AP_Anantpur_112124_1158582...,https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...,Telugu,[Telugu],AndhraPradesh,Anantpur,female,Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,515672,(jn~v/km +eCPcZlTfB:<|+53:Aivq~j
792436,792673,IISc_VaaniProject_S_AP_Anantpur_112124_1158582...,https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...,Telugu,[Telugu],AndhraPradesh,Anantpur,female,Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,515672,(jn~v/km +eCPcZlTfB:<|+53:Aivq~j
1051890,1052127,IISc_VaaniProject_S_AP_Anantpur_107463_1139557...,https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...,Telugu,[Telugu],AndhraPradesh,Anantpur,female,Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,515134,"(jl(v,pq +eCPcZlTfB:<|+53:Aivq~j"
1051902,1052139,IISc_VaaniProject_S_AP_Anantpur_107463_1139557...,https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...,Telugu,[Telugu],AndhraPradesh,Anantpur,female,Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,515134,"(jl(v,pq +eCPcZlTfB:<|+53:Aivq~j"
1153317,1153554,IISc_VaaniProject_S_AP_Anantpur_104144_1121525...,https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...,Telugu,[Telugu],AndhraPradesh,Anantpur,female,Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,515812,(jkwv)nk +eCPcZlTfB:<|+53:Aivq~j
1153348,1153585,IISc_VaaniProject_S_AP_Anantpur_104144_1121525...,https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...,Telugu,[Telugu],AndhraPradesh,Anantpur,female,Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,515812,(jkwv)nk +eCPcZlTfB:<|+53:Aivq~j
1453323,1453709,IISc_VaaniProject_S_AP_Anantpur_121072_1220106...,https://vaani.iisc.ac.in/Audios/Anantpur/IISc_...,Hindi,"[English, Hindi]",AndhraPradesh,Anantpur,male,Audios/Anantpur/IISc_VaaniProject_S_AP_Anantpu...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_000...,515133,(kkvr~ok +eCPcZlTfB:<|+53:Aivq~j


In [11]:
# Images that have both English and Hindi recordings.
has_english = imageBY["English"] == 1
has_hindi = imageBY["Hindi"] == 1
imageBY[has_english & has_hindi]

Unnamed: 0,id,imageFileName,audioCounts,total,state,district,assertLanguage,audio_urls,Agariya,Angika,...,Sadri,Santali,Shekhawati,Surgujia,Surjapuri,Tamil,Telugu,Tulu,Urdu,Wagdi
10529,10530,Images/IISc_VaaniProject_Bellary-SPECIFIC_0023...,46,417891,['Karnataka'],['Bellary'],"{'Kannada': 40, 'Hindi': 2, 'Telugu': 2, 'Engl...",{'Hindi': ['https://vaani.iisc.ac.in/Audios/Be...,0,0,...,0,0,0,0,0,0,1,0,0,0
10580,10581,Images/IISc_VaaniProject_Bellary-SPECIFIC_0028...,28,419513,['Karnataka'],['Bellary'],"{'Kannada': 17, 'Telugu': 7, 'Hindi': 2, 'Engl...",{'Kannada': ['https://vaani.iisc.ac.in/Audios/...,0,0,...,0,0,0,0,0,0,1,0,0,0
10855,10856,Images/IISc_VaaniProject_Bellary-SPECIFIC_0055...,40,428720,['Karnataka'],['Bellary'],"{'Kannada': 25, 'Telugu': 11, 'English': 2, 'H...",{'Kannada': ['https://vaani.iisc.ac.in/Audios/...,0,0,...,0,0,0,0,0,0,1,0,0,0
10862,10863,Images/IISc_VaaniProject_Bellary-SPECIFIC_0056...,37,428927,['Karnataka'],['Bellary'],"{'Kannada': 22, 'Telugu': 11, 'Hindi': 2, 'Eng...",{'Telugu': ['https://vaani.iisc.ac.in/Audios/B...,0,0,...,0,0,0,0,0,0,1,0,0,0
10942,10943,Images/IISc_VaaniProject_Bellary-SPECIFIC_0064...,39,431624,['Karnataka'],['Bellary'],"{'Kannada': 31, 'Telugu': 4, 'English': 2, 'Hi...",{'Kannada': ['https://vaani.iisc.ac.in/Audios/...,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70864,70865,Images/IISc_VaaniProject_Krishna-SPECIFIC_0029...,28,7336979,['AndhraPradesh'],['Krishna'],"{'Telugu': 21, 'Hindi': 4, 'English': 3}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/Kr...,0,0,...,0,0,0,0,0,0,1,0,0,0
71153,71154,Images/IISc_VaaniProject_Krishna-SPECIFIC_0057...,34,7346921,['AndhraPradesh'],['Krishna'],"{'Telugu': 29, 'Hindi': 3, 'English': 2}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...,0,0,...,0,0,0,0,0,0,1,0,0,0
71161,71162,Images/IISc_VaaniProject_Krishna-SPECIFIC_0058...,39,7347205,['AndhraPradesh'],['Krishna'],"{'Telugu': 32, 'Hindi': 5, 'English': 2}",{'Hindi': ['https://vaani.iisc.ac.in/Audios/Kr...,0,0,...,0,0,0,0,0,0,1,0,0,0
71665,71666,Images/IISc_VaaniProject_Krishna-SPECIFIC_0109...,41,7364827,['AndhraPradesh'],['Krishna'],"{'Telugu': 36, 'English': 3, 'Hindi': 2}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...,0,0,...,0,0,0,0,0,0,1,0,0,0


In [12]:
# Images whose English presence flag is set.
imageBY[imageBY["English"] == 1]

Unnamed: 0,id,imageFileName,audioCounts,total,state,district,assertLanguage,audio_urls,Agariya,Angika,...,Sadri,Santali,Shekhawati,Surgujia,Surjapuri,Tamil,Telugu,Tulu,Urdu,Wagdi
10309,10310,Images/IISc_VaaniProject_Bellary-SPECIFIC_0001...,41,410798,['Karnataka'],['Bellary'],"{'Kannada': 20, 'Telugu': 13, 'Urdu': 4, 'Bear...",{'Telugu': ['https://vaani.iisc.ac.in/Audios/B...,0,0,...,0,0,0,0,0,0,1,0,1,0
10322,10323,Images/IISc_VaaniProject_Bellary-SPECIFIC_0002...,33,411256,['Karnataka'],['Bellary'],"{'Kannada': 25, 'Telugu': 5, 'English': 2, 'Be...",{'Kannada': ['https://vaani.iisc.ac.in/Audios/...,0,0,...,0,0,0,0,0,0,1,0,0,0
10341,10342,Images/IISc_VaaniProject_Bellary-SPECIFIC_0004...,40,411887,['Karnataka'],['Bellary'],"{'Kannada': 33, 'Telugu': 5, 'English': 2}",{'Kannada': ['https://vaani.iisc.ac.in/Audios/...,0,0,...,0,0,0,0,0,0,1,0,0,0
10394,10395,Images/IISc_VaaniProject_Bellary-SPECIFIC_0009...,34,413709,['Karnataka'],['Bellary'],"{'Kannada': 28, 'Telugu': 4, 'English': 2}",{'Kannada': ['https://vaani.iisc.ac.in/Audios/...,0,0,...,0,0,0,0,0,0,1,0,0,0
10508,10509,Images/IISc_VaaniProject_Bellary-SPECIFIC_0020...,42,417224,['Karnataka'],['Bellary'],"{'Kannada': 31, 'Telugu': 9, 'English': 2}",{'Kannada': ['https://vaani.iisc.ac.in/Audios/...,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71275,71276,Images/IISc_VaaniProject_Krishna-SPECIFIC_0070...,28,7351129,['AndhraPradesh'],['Krishna'],"{'Telugu': 25, 'English': 3}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...,0,0,...,0,0,0,0,0,0,1,0,0,0
71372,71373,Images/IISc_VaaniProject_Krishna-SPECIFIC_0080...,31,7354457,['AndhraPradesh'],['Krishna'],"{'Telugu': 29, 'English': 2}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...,0,0,...,0,0,0,0,0,0,1,0,0,0
71650,71651,Images/IISc_VaaniProject_Krishna-SPECIFIC_0107...,34,7364295,['AndhraPradesh'],['Krishna'],"{'Telugu': 32, 'English': 2}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...,0,0,...,0,0,0,0,0,0,1,0,0,0
71665,71666,Images/IISc_VaaniProject_Krishna-SPECIFIC_0109...,41,7364827,['AndhraPradesh'],['Krishna'],"{'Telugu': 36, 'English': 3, 'Hindi': 2}",{'Telugu': ['https://vaani.iisc.ac.in/Audios/K...,0,0,...,0,0,0,0,0,0,1,0,0,0


In [13]:
# Per-image language counts, stored as stringified dicts in the CSV.
imageBY["assertLanguage"]

0                       {'Telugu': 45, 'Hindi': 7}
1                       {'Telugu': 35, 'Hindi': 6}
2         {'Telugu': 45, 'Hindi': 6, 'Bengali': 2}
3            {'Telugu': 32, 'Hindi': 5, 'Urdu': 1}
4            {'Telugu': 48, 'Hindi': 7, 'Urdu': 2}
                            ...                   
128802                              {'Telugu': 35}
128803                  {'Telugu': 16, 'Hindi': 3}
128804                  {'Telugu': 15, 'Hindi': 2}
128805                              {'Telugu': 31}
128806                  {'Telugu': 35, 'Hindi': 1}
Name: assertLanguage, Length: 128807, dtype: object

In [8]:
# imageBY_data = {'imageFileName': [], 'state': [], 'district': [],}

# Image - Audio(Hindi) for CSIP

Note: the original JSON has not been updated with the Delhi data.

In [14]:
import os
import pandas as pd
# import fireducks.pandas as pd
from tqdm import tqdm, trange

HINDI_AUDIO_DIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi"
IMAGEDIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images"
JSON_PATH = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Vaani_IIsc_Artpark_Full_Data.json"

In [15]:
jsondf = pd.read_json(JSON_PATH)
jsondf

Unnamed: 0,id,file_name,file_url,metadata
0,2,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '..."
1,3,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '..."
2,4,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '..."
3,5,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '..."
4,6,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '..."
...,...,...,...,...
9584927,9696433,IISc_VaaniProject_M_KA_Chamrajn_42017276_16081...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi..."
9584928,9696434,IISc_VaaniProject_M_KA_Chamrajn_42017276_16053...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi..."
9584929,9696435,IISc_VaaniProject_M_KA_Chamrajn_42017276_12370...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi..."
9584930,9696436,IISc_VaaniProject_M_KA_Chamrajn_42017276_09272...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi..."


In [16]:
# Probe: look for one specific Delhi clip among the file names listed in the JSON.
# Nothing is printed when the clip is absent from the metadata.
delhi_probe = 'IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav'
for i in tqdm(jsondf.file_name.values):
    # if 'Delhi' in i:
    if i != delhi_probe:
        continue
    print(i)

100%|██████████| 9584932/9584932 [00:02<00:00, 4578604.24it/s]


In [3]:
# Flatten the per-row `metadata` dict into top-level columns (nested keys become
# dot-separated column names) and place them next to the original columns.
# The raw `metadata` column is kept, so the same information is present twice.
fullJSON = pd.concat([jsondf, pd.json_normalize(jsondf.metadata)], axis=1)
fullJSON

Unnamed: 0,id,file_name,file_url,metadata,assertLanguage,audioFileName,audioManualQC,audioQualityCheck,district,gender,...,imageFaceData.face_90.facial_area,imageFaceData.face_91.facial_area,imageFaceData.face_92.facial_area,imageFaceData.face_93.facial_area,imageFaceData.face_94.facial_area,imageFaceData.face_95.facial_area,imageFaceData.face_96.facial_area,imageFaceData.face_97.facial_area,imageFaceData.face_98.facial_area,imageFaceData.face_99.facial_area
0,2,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '...",Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,,Automated,Aurangabad,female,...,,,,,,,,,,
1,3,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '...",Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,,Automated,Aurangabad,female,...,,,,,,,,,,
2,4,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '...",Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,,Automated,Aurangabad,female,...,,,,,,,,,,
3,5,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '...",Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,,Automated,Aurangabad,female,...,,,,,,,,,,
4,6,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'state': 'Maharashtra', 'gender': 'female', '...",Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,,Automated&Manual,Aurangabad,female,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9584927,9696433,IISc_VaaniProject_M_KA_Chamrajn_42017276_16081...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi...",Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,,Automated,Chamarajanagar,female,...,,,,,,,,,,
9584928,9696434,IISc_VaaniProject_M_KA_Chamrajn_42017276_16053...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi...",Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,,Automated&Manual,Chamarajanagar,female,...,,,,,,,,,,
9584929,9696435,IISc_VaaniProject_M_KA_Chamrajn_42017276_12370...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi...",Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,,Automated,Chamarajanagar,female,...,,,,,,,,,,
9584930,9696436,IISc_VaaniProject_M_KA_Chamrajn_42017276_09272...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'state': 'Karnataka', 'gender': 'female', 'pi...",Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,,Automated,Chamarajanagar,female,...,,,,,,,,,,


In [None]:
# fullJSON.to_parquet("Vaani-Images-Audio-JSON.parquet", index=False)

In [2]:
fullJSON = pd.read_parquet("Vaani-Images-Audio-JSON.parquet")
fullJSON

Unnamed: 0,id,file_name,file_url,metadata,assertLanguage,audioFileName,audioManualQC,audioQualityCheck,district,gender,...,imageFaceData.face_90.facial_area,imageFaceData.face_91.facial_area,imageFaceData.face_92.facial_area,imageFaceData.face_93.facial_area,imageFaceData.face_94.facial_area,imageFaceData.face_95.facial_area,imageFaceData.face_96.facial_area,imageFaceData.face_97.facial_area,imageFaceData.face_98.facial_area,imageFaceData.face_99.facial_area
0,2,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'assertLanguage': 'Marathi', 'audioFileName':...",Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,,Automated,Aurangabad,female,...,,,,,,,,,,
1,3,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'assertLanguage': 'Marathi', 'audioFileName':...",Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,,Automated,Aurangabad,female,...,,,,,,,,,,
2,4,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'assertLanguage': 'Marathi', 'audioFileName':...",Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,,Automated,Aurangabad,female,...,,,,,,,,,,
3,5,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'assertLanguage': 'Marathi', 'audioFileName':...",Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,,Automated,Aurangabad,female,...,,,,,,,,,,
4,6,IISc_VaaniProject_S_Maharashtra_Aurangabad_952...,https://vaani.iisc.ac.in/Audios/Aurangabad/IIS...,"{'assertLanguage': 'Marathi', 'audioFileName':...",Marathi,Audios/Aurangabad/IISc_VaaniProject_S_Maharash...,,Automated&Manual,Aurangabad,female,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9584927,9696433,IISc_VaaniProject_M_KA_Chamrajn_42017276_16081...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'assertLanguage': 'Kannada', 'audioFileName':...",Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,,Automated,Chamarajanagar,female,...,,,,,,,,,,
9584928,9696434,IISc_VaaniProject_M_KA_Chamrajn_42017276_16053...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'assertLanguage': 'Kannada', 'audioFileName':...",Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,,Automated&Manual,Chamarajanagar,female,...,,,,,,,,,,
9584929,9696435,IISc_VaaniProject_M_KA_Chamrajn_42017276_12370...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'assertLanguage': 'Kannada', 'audioFileName':...",Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,,Automated,Chamarajanagar,female,...,,,,,,,,,,
9584930,9696436,IISc_VaaniProject_M_KA_Chamrajn_42017276_09272...,https://vaani.iisc.ac.in/Audios/Chamrajn/IISc_...,"{'assertLanguage': 'Kannada', 'audioFileName':...",Kannada,Audios/Chamrajn/IISc_VaaniProject_M_KA_Chamraj...,,Automated,Chamarajanagar,female,...,,,,,,,,,,


In [13]:
# Probe: look for one specific Delhi clip among the file names of the flattened frame.
# Nothing is printed when the clip is absent from the metadata.
delhi_probe = 'IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav'
for i in tqdm(fullJSON.file_name.values):
    # if 'Delhi' in i:
    if i != delhi_probe:
        continue
    print(i)

  0%|          | 0/9584932 [00:00<?, ?it/s]

100%|██████████| 9584932/9584932 [00:01<00:00, 6650531.75it/s]


In [11]:
'Delhi' in '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav'

True

In [4]:
fullJSON.loc[fullJSON.state == 'Delhi']

Unnamed: 0,id,file_name,file_url,metadata,assertLanguage,audioFileName,audioManualQC,audioQualityCheck,district,gender,...,imageFaceData.face_90.facial_area,imageFaceData.face_91.facial_area,imageFaceData.face_92.facial_area,imageFaceData.face_93.facial_area,imageFaceData.face_94.facial_area,imageFaceData.face_95.facial_area,imageFaceData.face_96.facial_area,imageFaceData.face_97.facial_area,imageFaceData.face_98.facial_area,imageFaceData.face_99.facial_area


In [16]:
fullJSON.iloc[0,:].values

array([np.int64(2),
       'IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav',
       'https://vaani.iisc.ac.in/Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav',
       {'assertLanguage': 'Marathi', 'audioFileName': 'Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav', 'audioManualQC': None, 'audioQualityCheck': 'Automated', 'district': 'Aurangabad', 'gender': 'female', 'imageFaceData': None, 'imageFileName': 'Images/IISc_VaaniProject_Aurangabad-SPECIFIC_00559.jpg', 'languagesSpoken': array(['Marathi'], dtype=object), 'pincode': '431105', 'speakerImageHash': '(iq~v-nq +lTC]QXDCSnJ2~23=+|Nq~nn(', 'state': 'Maharashtra', 'stay(years)': 'Aurangabad(23)', 'transcript': None, 'transcriptQualityCheck': None},
       'Marathi',
       'Audios/Aurangabad/IISc_VaaniProject_S_Maharashtra_Aurangabad_95220_10885658_MRACO_32387_0_10237.wav',
       np.float64(nan), '

In [None]:
# Slim audio/image pairing table: keep six metadata columns, order the rows by
# state, then district, then asserted language, and renumber the index from 0.
# NOTE(review): despite the name, no language filter is applied here - confirm that is intended.
pairing_columns = ['state', 'district', 'gender', 'assertLanguage', 'file_name', 'imageFileName']
sort_keys = ['state', 'district', 'assertLanguage']
ImageAudioHindi_df = (
    fullJSON[pairing_columns]
    .sort_values(by=sort_keys, ascending=[True] * len(sort_keys))
    .reset_index(drop=True)
)
# ImageAudioHindi_df.to_csv("Image-Audio-Hindi.csv", index=False)
ImageAudioHindi_df

Unnamed: 0,state,district,gender,assertLanguage,file_name,imageFileName
0,AndhraPradesh,Anantpur,female,Bengali,IISc_VaaniProject_S_AP_Anantpur_100778_1106180...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_009...
1,AndhraPradesh,Anantpur,female,Bengali,IISc_VaaniProject_S_AP_Anantpur_100778_1106122...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_001...
2,AndhraPradesh,Anantpur,female,Bengali,IISc_VaaniProject_S_AP_Anantpur_100778_1106132...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_007...
3,AndhraPradesh,Anantpur,female,Bengali,IISc_VaaniProject_S_AP_Anantpur_100778_1106123...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_010...
4,AndhraPradesh,Anantpur,female,Bengali,IISc_VaaniProject_S_AP_Anantpur_100778_1106123...,Images/IISc_VaaniProject_Anantpur-SPECIFIC_005...
...,...,...,...,...,...,...
9584927,WestBengal,Purulia,male,Santali,IISc_VaaniProject_M_WB_Purulia_Guru45176_03235...,Images/IISc_VaaniProject_Purulia-SPECIFIC_0165...
9584928,WestBengal,Purulia,male,Santali,IISc_VaaniProject_M_WB_Purulia_Guru45176_03313...,Images/IISc_VaaniProject_GENERIC_0839.jpg
9584929,WestBengal,Purulia,male,Santali,IISc_VaaniProject_M_WB_Purulia_Guru45176_03292...,Images/IISc_VaaniProject_GENERIC_0022.jpg
9584930,WestBengal,Purulia,male,Santali,IISc_VaaniProject_M_WB_Purulia_Guru45176_03050...,Images/IISc_VaaniProject_Purulia-SPECIFIC_0162...


# Image - Audio(Hindi) Dataloaders for CSIP

In [50]:
import os
# import pandas as pd
import fireducks.pandas as pd
from tqdm import tqdm, trange

HINDI_AUDIO_DIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi"
IMAGEDIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images"
IMAGEAUDIOCSV = r"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi3.csv"

In [51]:
def walkDIR(folder_path, include=None):
    """
    Recursively collect the paths of the files below ``folder_path``.

    Parameters:
        folder_path (str): Root directory handed to ``os.walk``.
        include (str | iterable of str | None): File-name suffix(es) to keep,
            e.g. ``['.jpg', '.png']``. A single string such as ``'.wav'`` is
            treated as one suffix. ``None`` keeps every file; an empty
            iterable keeps none. Matching is case-sensitive.
    Returns:
        list[str]: ``os.path.join(root, file)`` for every kept file, in the
        order produced by ``os.walk``.

    The number of files found is printed before returning.
    """
    # A bare string would otherwise be iterated character by character, so
    # '.wav' would match any name ending in '.', 'w', 'a' or 'v'.
    if isinstance(include, str):
        include = (include,)
    # str.endswith accepts a tuple of suffixes; build it once, outside the walk.
    suffixes = None if include is None else tuple(include)

    file_list = [
        os.path.join(root, file)
        for root, _, files in os.walk(folder_path)
        for file in files
        if suffixes is None or file.endswith(suffixes)
    ]
    print("Files found:", len(file_list))
    return file_list

In [52]:
images_files = [os.path.join(IMAGEDIR, i) for i in os.listdir(IMAGEDIR) if i.endswith(".jpg")]
audio_files = walkDIR(HINDI_AUDIO_DIR, include=['.wav'])

len(images_files), len(audio_files)

Files found: 38300


(128807, 38300)

In [53]:
images_files[:3]

['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_TehriGarhwal-SPECIFIC_00863.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Budaun-SPECIFIC_00129.jpg']

In [54]:
audio_files[:3]

['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Ishi43015_43015203103000098276_NewDelhi-SPECIFIC_00021_9344_14805.wav',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Radi56078_56078170942000098763_GENERIC_1166_182_3703.wav']

In [55]:
os.path.dirname(IMAGEDIR)

'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani'

In [56]:
ImageAudioHindi_df = pd.read_csv(IMAGEAUDIOCSV)
ImageAudioHindi_df

Unnamed: 0,audio_path,referenceImage,gender,state,district
0,IISc_VaaniProject_M_Delhi_NewDelhi_Zoya76543_7...,Images/IISc_VaaniProject_GENERIC_0473.jpg,Female,Delhi,NewDelhi
1,IISc_VaaniProject_M_Delhi_NewDelhi_Kaja46663_4...,Images/IISc_VaaniProject_GENERIC_1011.jpg,Female,Delhi,NewDelhi
2,IISc_VaaniProject_M_Delhi_NewDelhi_Kris66646_6...,Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015...,Female,Delhi,NewDelhi
3,IISc_VaaniProject_M_Delhi_NewDelhi_Abuz00012_0...,Images/IISc_VaaniProject_GENERIC_0418.jpg,Male,Delhi,NewDelhi
4,IISc_VaaniProject_M_Delhi_NewDelhi_Adah26256_2...,Images/IISc_VaaniProject_GENERIC_0851.jpg,Male,Delhi,NewDelhi
...,...,...,...,...,...
38295,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_00594...,Male,Jharkhand,Ranchi
38296,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_00700...,Male,Jharkhand,Ranchi
38297,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_01941...,Male,Jharkhand,Ranchi
38298,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_01405...,Male,Jharkhand,Ranchi


Audio Image Mapping

In [11]:
IMAGEDIR, HINDI_AUDIO_DIR

('/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi')

In [None]:
# Pair every CSV row's reference image with its Hindi clip and keep only the
# pairs for which both files are present on disk.
mapping_counter = 0
available_img_audios = {"image_path": [], "audio_path": []}

row_iter = tqdm(
    ImageAudioHindi_df.iterrows(),
    ncols=100,
    total=ImageAudioHindi_df.shape[0],
    colour='YELLOW',
)
for i, row in row_iter:
    # referenceImage already starts with "Images/", so it is joined onto dirname(IMAGEDIR).
    image_path = os.path.join(os.path.dirname(IMAGEDIR), row.referenceImage)
    # Clips are looked up under HINDI_AUDIO_DIR/<state>_<district>/.
    audio_path = os.path.join(HINDI_AUDIO_DIR, f"{row.state}_{row.district}", row.audio_path)

    if not (os.path.isfile(audio_path) and os.path.isfile(image_path)):
        continue
    available_img_audios['image_path'].append(image_path)
    available_img_audios['audio_path'].append(audio_path)

available_img_audios_df = pd.DataFrame(available_img_audios)
available_img_audios_df

100%|[33m████████████████████████████████████████████████████████[0m| 38300/38300 [00:54<00:00, 704.27it/s][0m


Unnamed: 0,image_path,audio_path
0,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
1,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
2,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
3,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
4,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
...,...,...
22327,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22328,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22329,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22330,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...


In [None]:
# available_img_audios_df.to_csv("available_img_audios.csv", index=False)

In [None]:
# from sklearn.model_selection import train_test_split

# train_df, test_df = train_test_split(available_img_audios_df, test_size=0.3, shuffle=True, random_state=42)
# train_df = train_df.reset_index(drop=True)
# test_df = test_df.reset_index(drop=True)
# train_df.shape, test_df.shape

((15632, 2), (6700, 2))

In [None]:
# train_df.to_csv("available_img_audios_TRAIN.csv", index=False)
# test_df.to_csv("available_img_audios_TEST.csv", index=False)

Audio Image Mapping for New Downloaded Images

In [None]:
import os
# import pandas as pd
import fireducks.pandas as pd
from tqdm import tqdm, trange

HINDI_AUDIO_DIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi"
IMAGEDIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images"
IMAGEAUDIOCSV = r"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi3.csv"

In [None]:
def walkDIR(folder_path, include=None):
    """
    Recursively collect the paths of the files below ``folder_path``.

    Parameters:
        folder_path (str): Root directory handed to ``os.walk``.
        include (str | iterable of str | None): File-name suffix(es) to keep,
            e.g. ``['.jpg', '.png']``. A single string such as ``'.wav'`` is
            treated as one suffix. ``None`` keeps every file; an empty
            iterable keeps none. Matching is case-sensitive.
    Returns:
        list[str]: ``os.path.join(root, file)`` for every kept file, in the
        order produced by ``os.walk``.

    The number of files found is printed before returning.
    """
    # A bare string would otherwise be iterated character by character, so
    # '.wav' would match any name ending in '.', 'w', 'a' or 'v'.
    if isinstance(include, str):
        include = (include,)
    # str.endswith accepts a tuple of suffixes; build it once, outside the walk.
    suffixes = None if include is None else tuple(include)

    file_list = [
        os.path.join(root, file)
        for root, _, files in os.walk(folder_path)
        for file in files
        if suffixes is None or file.endswith(suffixes)
    ]
    print("Files found:", len(file_list))
    return file_list

In [None]:
images_files = [os.path.join(IMAGEDIR, i) for i in os.listdir(IMAGEDIR) if i.endswith(".jpg")]
audio_files = walkDIR(HINDI_AUDIO_DIR, include=['.wav'])

len(images_files), len(audio_files)

Files found: 38300


(128807, 38300)

In [None]:
images_files[:3]

['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Saran-SPECIFIC_00486.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_TehriGarhwal-SPECIFIC_00863.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/IISc_VaaniProject_Budaun-SPECIFIC_00129.jpg']

In [None]:
audio_files[:3]

['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Ishi43015_43015203103000098276_NewDelhi-SPECIFIC_00021_9344_14805.wav',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Radi56078_56078170942000098763_GENERIC_1166_182_3703.wav']

In [None]:
os.path.dirname(IMAGEDIR)

'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani'

In [None]:
IMAGEDIR, HINDI_AUDIO_DIR

('/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi')

In [None]:
# Pair every CSV row's reference image with its Hindi clip and keep only the
# pairs for which both files are present on disk.
mapping_counter = 0
available_img_audios = {"image_path": [], "audio_path": []}

row_iter = tqdm(
    ImageAudioHindi_df.iterrows(),
    ncols=100,
    total=ImageAudioHindi_df.shape[0],
    colour='YELLOW',
)
for i, row in row_iter:
    # referenceImage already starts with "Images/", so it is joined onto dirname(IMAGEDIR).
    image_path = os.path.join(os.path.dirname(IMAGEDIR), row.referenceImage)
    # Clips are looked up under HINDI_AUDIO_DIR/<state>_<district>/.
    audio_path = os.path.join(HINDI_AUDIO_DIR, f"{row.state}_{row.district}", row.audio_path)

    if not (os.path.isfile(audio_path) and os.path.isfile(image_path)):
        continue
    available_img_audios['image_path'].append(image_path)
    available_img_audios['audio_path'].append(audio_path)

available_img_audios_df = pd.DataFrame(available_img_audios)
available_img_audios_df

100%|[33m████████████████████████████████████████████████████████[0m| 38300/38300 [00:54<00:00, 704.27it/s][0m


Unnamed: 0,image_path,audio_path
0,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
1,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
2,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
3,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
4,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
...,...,...
22327,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22328,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22329,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
22330,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...


In [None]:
# available_img_audios_df.to_csv("available_img_audios.csv", index=False)

In [None]:
# from sklearn.model_selection import train_test_split

# train_df, test_df = train_test_split(available_img_audios_df, test_size=0.3, shuffle=True, random_state=42)
# train_df = train_df.reset_index(drop=True)
# test_df = test_df.reset_index(drop=True)
# train_df.shape, test_df.shape

((15632, 2), (6700, 2))

In [None]:
# train_df.to_csv("available_img_audios_TRAIN.csv", index=False)
# test_df.to_csv("available_img_audios_TEST.csv", index=False)

Audio Image Mapping for All Images

In [None]:
import os, shutil
def copy_files_from_folders(name, source_folders, destination_folder):
    r'''
    Copy the files of several source folders into one flat folder, tagging
    each copy with the kind of folder it came from.

    The copies are written to ``os.path.join(destination_folder, name)``,
    which is created if it does not exist. A tag is inserted between the
    file's stem and its extension (``photo.jpg`` -> ``photopost.jpg``):

    * ``'highligits'`` when the source path contains both ``'story'`` and
      ``'highligits'`` (spelling exactly as matched by the code),
    * ``'story'`` when the source path contains ``'story'`` only,
    * ``'post'`` otherwise.

    Parameters:
        name (str): Name of the sub-folder created inside ``destination_folder``.
        source_folders (list): Paths of the folders to copy from. Entries
            that are not directories are reported and skipped.
        destination_folder (str): Parent folder of the output sub-folder.
    Returns:
        None

    Only regular files directly inside each source folder are copied
    (no recursion). If a file with the tagged name already exists in the
    output folder it is not overwritten; its path is collected and printed
    at the end, after the total number of files copied.

    Example:
        name = 'Folder1'
        source_folders = ['/data/posts', '/data/story']
        destination_folder = '/data/merged'
        copy_files_from_folders(name, source_folders, destination_folder)
    '''

    destination_folder = os.path.join(destination_folder, name)
    os.makedirs(destination_folder, exist_ok=True)

    total_files = 0
    duplicate_files = []

    for source_folder in source_folders:
        print(source_folder)
        # os.listdir would raise on a plain file; callers often pass raw
        # directory listings, so skip anything that is not a folder.
        if not os.path.isdir(source_folder):
            print(f'Skipping (not a directory): {source_folder}')
            continue

        # The tag depends only on the folder path, so decide it once per folder.
        if 'story' in source_folder:
            tag = 'highligits' if 'highligits' in source_folder else 'story'
        else:
            tag = 'post'

        for file_name in tqdm(os.listdir(source_folder)):
            source_file_path = os.path.join(source_folder, file_name)
            if not os.path.isfile(source_file_path):
                continue

            # splitext keeps interior dots ("a.b.jpg" -> "a.b" + ".jpg") and
            # leaves extensionless names intact, unlike splitting on '.'.
            stem, extension = os.path.splitext(file_name)
            destination_file_path = os.path.join(destination_folder, f'{stem}{tag}{extension}')

            if os.path.isfile(destination_file_path):
                duplicate_files.append(destination_file_path)
            else:
                shutil.copy(source_file_path, destination_file_path)
                total_files += 1

    print(f'Total {total_files} files copies')
    for i in duplicate_files:
        print(i)
        


name = 'Folder1'
destination_folder = r'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/'
# Only sub-directories are valid sources (os.listdir also returns plain files).
# The output folder is excluded so that re-running the cell never feeds the
# merged folder back into itself when it lives inside IMAGEDIR.
output_folder = os.path.abspath(os.path.join(destination_folder, name))
source_folders = [
    path
    for path in (os.path.join(IMAGEDIR, entry) for entry in os.listdir(IMAGEDIR))
    if os.path.isdir(path) and os.path.abspath(path) != output_folder
]
copy_files_from_folders(name, source_folders, destination_folder)

In [None]:
import subprocess
import os
from joblib import Parallel, delayed

def merge_single_folder(src, target_folder, ignore_existing=False, dry_run=False):
    """
    Merge the contents of one source folder into the target folder using rsync.

    Parameters:
        src (str): Source folder. A missing path or a path that is not a
            directory is reported and skipped without calling rsync.
        target_folder (str): Folder that receives the files.
        ignore_existing (bool): Pass ``--ignore-existing`` to rsync.
        dry_run (bool): Pass ``--dry-run`` to rsync.
    Returns:
        None
    Raises:
        subprocess.CalledProcessError: rsync exited with a non-zero status
            (``check=True``).
    """
    if not os.path.exists(src):
        print(f"Source folder does not exist: {src}")
        return
    if not os.path.isdir(src):
        # The command below passes "<src>/", which rsync rejects for a plain file.
        print(f"Source is not a folder, skipping: {src}")
        return

    cmd = ["rsync", "-ah"]

    if ignore_existing:
        cmd.append("--ignore-existing")
    if dry_run:
        cmd.append("--dry-run")

    # Exactly one trailing slash on the source: rsync then copies the folder's
    # contents rather than the folder itself.
    cmd += [f"{src.rstrip('/')}/", target_folder]

    print(f"Merging '{src}' into '{target_folder}'")
    subprocess.run(cmd, check=True)
    # The target may be absent (e.g. a dry run called directly), so only count
    # entries when there is a directory to list.
    if os.path.isdir(target_folder):
        print(f"{len(os.listdir(target_folder))} files currently in target")
    print("-" * 100)

def merge_folders_with_rsync_parallel(source_folders, target_folder, ignore_existing=False, dry_run=False, n_jobs=-1):
    """
    Merge several source folders into one target folder, one rsync call per source.

    The target folder is created when it does not exist. Each source is then
    passed to ``merge_single_folder`` through a joblib ``Parallel`` runner
    using the "loky" backend; ``n_jobs`` is forwarded to joblib unchanged.
    """
    target_missing = not os.path.exists(target_folder)
    if target_missing:
        os.makedirs(target_folder)

    merge_job = delayed(merge_single_folder)
    runner = Parallel(n_jobs=n_jobs, backend="loky")
    runner(
        merge_job(src, target_folder, ignore_existing, dry_run)
        for src in source_folders
    )


# Merge every entry found under From-Images-Config into the flat Folder3 directory.
# NOTE: this rebinds IMAGEDIR; later cells see this path until it is reassigned.
IMAGEDIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/From-Images-Config/"
target_dir = '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3'

source_dirs = []
for entry in os.listdir(IMAGEDIR):
    source_dirs.append(os.path.join(IMAGEDIR, entry))

merge_folders_with_rsync_parallel(
    source_folders=source_dirs,
    target_folder=target_dir,
    ignore_existing=False,
    dry_run=False,
)

In [8]:
import os
# import pandas as pd
import fireducks.pandas as pd
from tqdm import tqdm, trange

HINDI_AUDIO_DIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi"
IMAGEDIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/"
IMAGEAUDIOCSV = r"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi3.csv"

In [5]:
ImageAudioHindi_df = pd.read_csv(IMAGEAUDIOCSV)
ImageAudioHindi_df

Unnamed: 0,audio_path,referenceImage,gender,state,district
0,IISc_VaaniProject_M_Delhi_NewDelhi_Zoya76543_7...,Images/IISc_VaaniProject_GENERIC_0473.jpg,Female,Delhi,NewDelhi
1,IISc_VaaniProject_M_Delhi_NewDelhi_Kaja46663_4...,Images/IISc_VaaniProject_GENERIC_1011.jpg,Female,Delhi,NewDelhi
2,IISc_VaaniProject_M_Delhi_NewDelhi_Kris66646_6...,Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015...,Female,Delhi,NewDelhi
3,IISc_VaaniProject_M_Delhi_NewDelhi_Abuz00012_0...,Images/IISc_VaaniProject_GENERIC_0418.jpg,Male,Delhi,NewDelhi
4,IISc_VaaniProject_M_Delhi_NewDelhi_Adah26256_2...,Images/IISc_VaaniProject_GENERIC_0851.jpg,Male,Delhi,NewDelhi
...,...,...,...,...,...
38295,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_00594...,Male,Jharkhand,Ranchi
38296,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_00700...,Male,Jharkhand,Ranchi
38297,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_01941...,Male,Jharkhand,Ranchi
38298,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_01405...,Male,Jharkhand,Ranchi


In [6]:
def walkDIR(folder_path, include=None):
    """
    Recursively collect the paths of the files below ``folder_path``.

    Parameters:
        folder_path (str): Root directory handed to ``os.walk``.
        include (str | iterable of str | None): File-name suffix(es) to keep,
            e.g. ``['.jpg', '.png']``. A single string such as ``'.wav'`` is
            treated as one suffix. ``None`` keeps every file; an empty
            iterable keeps none. Matching is case-sensitive.
    Returns:
        list[str]: ``os.path.join(root, file)`` for every kept file, in the
        order produced by ``os.walk``.

    The number of files found is printed before returning.
    """
    # A bare string would otherwise be iterated character by character, so
    # '.wav' would match any name ending in '.', 'w', 'a' or 'v'.
    if isinstance(include, str):
        include = (include,)
    # str.endswith accepts a tuple of suffixes; build it once, outside the walk.
    suffixes = None if include is None else tuple(include)

    file_list = [
        os.path.join(root, file)
        for root, _, files in os.walk(folder_path)
        for file in files
        if suffixes is None or file.endswith(suffixes)
    ]
    print("Files found:", len(file_list))
    return file_list

In [9]:
images_files = walkDIR(IMAGEDIR, include=['.jpg', '.png'])
audio_files = walkDIR(HINDI_AUDIO_DIR, include=['.wav'])

len(images_files), len(audio_files)

Files found: 284593
Files found: 73755


(284593, 73755)

In [10]:
images_files[:3]

['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/IISc_VaaniProject_Churu-SPECIFIC_00422.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/IISc_VaaniProject_LowerDibangvalley-SPECIFIC_01474.jpg',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/IISc_VaaniProject_Khordha-SPECIFIC_02034.jpg']

In [11]:
audio_files[:3]

['/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Sani20169_20169062008000070453_NewDelhi-SPECIFIC_00634_453_4576.wav',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Ishi43015_43015203103000098276_NewDelhi-SPECIFIC_00021_9344_14805.wav',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi/Delhi_NewDelhi/IISc_VaaniProject_M_Delhi_NewDelhi_Radi56078_56078170942000098763_GENERIC_1166_182_3703.wav']

In [12]:
os.path.dirname(IMAGEDIR)

'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3'

In [14]:
os.path.dirname(os.path.dirname(IMAGEDIR))

'/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images'

In [13]:
IMAGEDIR, HINDI_AUDIO_DIR

('/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/',
 '/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi')

In [16]:
# Pair every CSV row's reference image with its Hindi clip and keep only the
# pairs for which both files are present on disk.
mapping_counter = 0
available_img_audios = {"image_path": [], "audio_path": []}

row_iter = tqdm(
    ImageAudioHindi_df.iterrows(),
    ncols=100,
    total=ImageAudioHindi_df.shape[0],
    colour='YELLOW',
)
for i, row in row_iter:
    # Only the base name of referenceImage is used; it is joined onto
    # dirname(IMAGEDIR), which for a path ending in "/" is that folder itself.
    image_path = os.path.join(os.path.dirname(IMAGEDIR), os.path.basename(row.referenceImage))
    # Clips are looked up under HINDI_AUDIO_DIR/<state>_<district>/.
    audio_path = os.path.join(HINDI_AUDIO_DIR, f"{row.state}_{row.district}", row.audio_path)

    if not (os.path.isfile(audio_path) and os.path.isfile(image_path)):
        continue
    available_img_audios['image_path'].append(image_path)
    available_img_audios['audio_path'].append(audio_path)

available_img_audios_df = pd.DataFrame(available_img_audios)
available_img_audios_df

100%|[33m████████████████████████████████████████████████████████[0m| 38300/38300 [00:46<00:00, 829.07it/s][0m


Unnamed: 0,image_path,audio_path
0,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
1,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
2,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
3,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
4,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
...,...,...
38295,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
38296,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
38297,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...
38298,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...,/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaan...


In [24]:
# mapping_counter = 0
# available_img_audios = {"image_path":[], "audio_path":[]}

# for i, row in tqdm(ImageAudioHindi_df.iterrows(), ncols=100, total=ImageAudioHindi_df.shape[0], colour='YELLOW'):
#     image_path = os.path.join(os.path.dirname(IMAGEDIR), row.referenceImage)
#     audio_path = os.path.join(HINDI_AUDIO_DIR, f"{row.state}_{row.district}", row.audio_path)
        
#     if all([os.path.isfile(audio_path), os.path.isfile(image_path)]):
#         available_img_audios['image_path'].append(image_path)
#         available_img_audios['audio_path'].append(audio_path)

# available_img_audios_df = pd.DataFrame(available_img_audios)
# available_img_audios_df

  0%|[33m                                                                     [0m| 0/38300 [00:00<?, ?it/s][0m

100%|[33m███████████████████████████████████████████████████████[0m| 38300/38300 [00:28<00:00, 1362.80it/s][0m


Unnamed: 0,image_path,audio_path


In [None]:
# available_img_audios_df.to_csv("available_img_audios.csv", index=False)

In [17]:
# from sklearn.model_selection import train_test_split

# train_df, test_df = train_test_split(available_img_audios_df, test_size=0.3, shuffle=True, random_state=42)
# train_df = train_df.reset_index(drop=True)
# test_df = test_df.reset_index(drop=True)
# train_df.shape, test_df.shape

((26810, 2), (11490, 2))

In [None]:
# train_df.to_csv("available_img_audios_TRAIN2.csv", index=False)
# test_df.to_csv("available_img_audios_TEST2.csv", index=False)

Audio Image Mapping for All Images Polars MetaData

In [None]:
import os
# import pandas as pd
import fireducks.pandas as pd
import polars as pl
from tqdm import tqdm, trange

HINDI_AUDIO_DIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Audios/Hindi"
IMAGEDIR = r"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images/Folder3/"
IMAGEAUDIOCSV = r"/home/IITB/ai-at-ieor/23m1521/ashish/MTP/Vaani/Vaani-Audio-Image-Hindi3.csv"


In [None]:
ImageAudioHindi_df = pd.read_csv(IMAGEAUDIOCSV)
ImageAudioHindi_df

Unnamed: 0,audio_path,referenceImage,gender,state,district
0,IISc_VaaniProject_M_Delhi_NewDelhi_Zoya76543_7...,Images/IISc_VaaniProject_GENERIC_0473.jpg,Female,Delhi,NewDelhi
1,IISc_VaaniProject_M_Delhi_NewDelhi_Kaja46663_4...,Images/IISc_VaaniProject_GENERIC_1011.jpg,Female,Delhi,NewDelhi
2,IISc_VaaniProject_M_Delhi_NewDelhi_Kris66646_6...,Images/IISc_VaaniProject_NewDelhi-SPECIFIC_015...,Female,Delhi,NewDelhi
3,IISc_VaaniProject_M_Delhi_NewDelhi_Abuz00012_0...,Images/IISc_VaaniProject_GENERIC_0418.jpg,Male,Delhi,NewDelhi
4,IISc_VaaniProject_M_Delhi_NewDelhi_Adah26256_2...,Images/IISc_VaaniProject_GENERIC_0851.jpg,Male,Delhi,NewDelhi
...,...,...,...,...,...
38295,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_00594...,Male,Jharkhand,Ranchi
38296,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_00700...,Male,Jharkhand,Ranchi
38297,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_01941...,Male,Jharkhand,Ranchi
38298,IISc_VaaniProject_K_Jharkhand_Ranchi_Ranchi844...,Images/IISc_VaaniProject_Ranchi-SPECIFIC_01405...,Male,Jharkhand,Ranchi
