# **Hugging Face Base Model Evaluation**

In [None]:
!pip install --upgrade pip
!pip install --upgrade git+https://github.com/huggingface/transformers.git accelerate datasets[audio]
!pip install python-Levenshtein
!pip install jiwer

'''
accelerate - it is a Python library developed by Hugging Face, purpose is to speed up the training and inference process of deep learning models.

datasets - specifically designed for loading datasets from the Hugging Face Datasets Hub,
           which is a centralized repository of various datasets for natural language processing tasks.

python-Levenshtein - calculates the similarity percentage between two text strings using Levenshtein distance metric

jiwar - measure how well your system performs by compare predicted text output and reference text
'''

In [None]:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

'''
Transformers  : Access to large collection of pre-trained models for various NLP tasks

AutoModelForSpeechSeq2Seq - Automatically load any pre-trained speech-to-sequence model from Hugging Face's model hub

AutoProcessor - process of preparing your data for a specific pre-trained model by automatically choosing the right data processor

Pipeline - provides a simplified interface for using pre-trained models to perform various NLP tasks.
'''

In [None]:
import torch

'''
Torch - torch is a library that provides support for numerical operations using tensors, which are multi-dimensional arrays.
        It's a fundamental library for deep learning and scientific computing

Tensors - similar to arrays but come with additional features (like automatic differentiation) specifically designed for deep learning.
'''

In [None]:
import pandas as pd
import Levenshtein
import locale
import re
import os

'''
Levenshtein - calculates the similarity percentage between two text strings using Levenshtein distance metric

Levenshtein distance - measures minimum number of single-character edits (insert, delete, or substitute)
                       required to change one string into another

Locale - handle enabling adaptation to different languages and encodings,
         like date and time formats, currency symbols, and character encodings.

re - regular expressions help us to match the pattern based on requirements

pandas - simplifies data manipulation and analysis in Python

os - interact with the operating system, handling tasks like file operations and system commands.
'''

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# connect your Colab with your Google Drive. This allows you to access files and data stored in your Google Drive directly from your Colab notebook.

from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
'''
CPUs (Central Processing Units) - generally designed for serial processing, meaning they execute one instruction at a time.

GPUs (Graphics Processing Units)- have a massively parallel architecture with thousands of cores.
                                  This allows them to perform many calculations simultaneously,
                                  making them highly efficient for tasks that can be parallelized.

Deep learning involves a lot of matrix operations, such as matrix multiplications.
GPUs are optimized for these types of operations, and their architecture allows them to perform these calculations much faster than CPUs.
'''

## **OpenAI Whisper Large V3 Pre-trained Model**

In [None]:
# cuda - GPUs are often more efficient for certain types of computations (like deep learning), and using them can significantly speed up your code.

device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [None]:
# float32 - floating-point number for more accuracy and memory to store number ; float16 - less accuracy and memory used in GPU

torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
torch_dtype

torch.float16

In [None]:
# sets up and loads the "whisper-large-v3" model, and moves it to the specified computing device for further processing (like CPU or GPU)

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
print(model)
model.to(device)

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=Tr

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=Tr

In [None]:
#  automatically load the appropriate processor for 'openai/whisper-small' model

processor = AutoProcessor.from_pretrained(model_id)      # model_id = 'openai/whisper-large'

In [None]:
# create an Automatic Speech Recognition (ASR) pipeline for the Marathi audio-to-text process

pipe = pipeline(
                task = "automatic-speech-recognition",            # specifies the task for which you are creating the pipeline
                model = model,                                    # specifies the ASR model to be used
                tokenizer = processor.tokenizer,                  # tokenizer preparing raw audio data (continuous waveform) break into smaller chunks or frames that ASR model can process
                feature_extractor = processor.feature_extractor,  # important features extracted from raw audio for model to learn patterns and make predictions about the spoken words in audio
                max_new_tokens = 128,                             # limits the maximum no of tokens in generated text (1 word = 1 token), from 129 to all text are truncated (cut off)
                chunk_length_s = 30,                              # ASR system each chunks of audio to text by processing it in 30-second segments.
                batch_size = 16,                                  # number of audio chunks processed simultaneously in each batch
                return_timestamps = True,                         # Timestamps provide information about when each word in the transcription occurs in the audio
                torch_dtype = torch_dtype,                        # specifies the data type used by the PyTorch framework
                device = device                                   # specifies the device (GPU or CPU) on which the model will run --> 'cuda:0'
              )
pipe

<transformers.pipelines.automatic_speech_recognition.AutomaticSpeechRecognitionPipeline at 0x7c8d23430eb0>

In [None]:
# locale module eneble to get the preferred encoding

locale.getpreferredencoding = lambda: "UTF-8"

# UTF-8 is a character encoding that supports a wide range of characters from various languages,
# making it a good choice for handling text data with different character sets.

In [None]:
# calculates the similarity percentage between two text strings using Levenshtein distance metric

def calculate_text_similarity_percentage(text1, text2):

    distance = Levenshtein.distance(text1, text2)                 # (hello, halo) ------> 'e' to 'a' ----> 'l' to remove ----> so 2 edits need 'hello' convert to 'halo'

    max_length = max(len(text1), len(text2))                      # hello=5; halo=4 -----> max = 5

    similarity_percentage = 100 - (distance / max_length * 100)   # 100 - (2/5 * 100) = 60

    return similarity_percentage                                  # so 60% similarity between 'hello' and 'halo'


In [None]:
# Remove specific special characters from the input text using regular expressions

def remove_special_characters(text):

    pattern = '[,\|\?\.\!\-\;\:\"\“\%\‘\”\�।]'        # defines a set of special characters to be removed.

    cleaned_text = re.sub(pattern, '', text).strip()  # Use re.sub to replace matched characters to empty string & strip() remove empty space in both ends of text

    return cleaned_text

**Sample audio check: 1**

In [None]:
text1 = 'अनेक रचना अभंग गवळणी असे स्फुट लेखन त्यांनी केले'                                   # He wrote many compositions that are unbreakable
text2 = remove_special_characters(' अनेक रचना, अभंग, गवलनी असे स्पूट लेकन त्याननी केली।')   # Many Rachna Abhang Gavalni spoot but she did it

similarity_percentage = calculate_text_similarity_percentage(text1, text2)
print(f'{similarity_percentage:.2f}')

85.42


**Sample audio check: 2**

In [None]:
text1 = 'या पानास लेखाचे स्वरूप यायला हवे'                               # This page should take the form of an article
text2 = remove_special_characters('या पानास लेखासे स्वरूप याईला हवे।')   # This page should be in article format

similarity_percentage = calculate_text_similarity_percentage(text1, text2)
print(f'{similarity_percentage:.2f}')

93.75


**Sample audio check: 3**

In [None]:
text1 = 'अनेक रचना अभंग गवळणी असे स्फुट लेखन त्यांनी केले'  # He wrote a lot of compositions, such as Abhang Gavalni
text2 = 'अनेक रचना अभंग गवळणी असे स्फुट लेखन त्यांनी केले'  # He wrote a lot of compositions, such as Abhang Gavalni

similarity_percentage = calculate_text_similarity_percentage(text1, text2)
print(f'{similarity_percentage:.2f}')

100.00


In [None]:
# uses the os library to list all the files in a specific directory

file_list = os.listdir('/content/drive/MyDrive/Voice_AI_Project/Audio')

print(file_list)

['common_voice_mr_30677278.wav', 'common_voice_mr_30728059.wav', 'common_voice_mr_30677264.wav', 'common_voice_mr_30709330.wav', 'common_voice_mr_30767401.wav', 'common_voice_mr_30709267.wav', 'common_voice_mr_30767350.wav', 'common_voice_mr_30666519.wav', 'common_voice_mr_30709028.wav', 'common_voice_mr_30705397.wav', 'common_voice_mr_30705227.wav', 'common_voice_mr_30705427.wav', 'common_voice_mr_30709469.wav', 'common_voice_mr_30716314.wav', 'common_voice_mr_30677352.wav', 'common_voice_mr_30677320.wav', 'common_voice_mr_30709500.wav', 'common_voice_mr_30724323.wav', 'common_voice_mr_30724320.wav', 'common_voice_mr_30667796.wav', 'common_voice_mr_30705143.wav', 'common_voice_mr_30767416.wav', 'common_voice_mr_30728129.wav', 'common_voice_mr_30724319.wav', 'common_voice_mr_30724317.wav', 'common_voice_mr_30705523.wav', 'common_voice_mr_30767410.wav', 'common_voice_mr_30709179.wav', 'common_voice_mr_30705098.wav', 'common_voice_mr_30709026.wav', 'common_voice_mr_30705224.wav', 'common

In [None]:
len(file_list)

833

In [None]:
# select the 500 audio samples in the evaluation process

file_list = file_list[0:500]

print(file_list)
len(file_list)

['common_voice_mr_30677278.wav', 'common_voice_mr_30728059.wav', 'common_voice_mr_30677264.wav', 'common_voice_mr_30709330.wav', 'common_voice_mr_30767401.wav', 'common_voice_mr_30709267.wav', 'common_voice_mr_30767350.wav', 'common_voice_mr_30666519.wav', 'common_voice_mr_30709028.wav', 'common_voice_mr_30705397.wav', 'common_voice_mr_30705227.wav', 'common_voice_mr_30705427.wav', 'common_voice_mr_30709469.wav', 'common_voice_mr_30716314.wav', 'common_voice_mr_30677352.wav', 'common_voice_mr_30677320.wav', 'common_voice_mr_30709500.wav', 'common_voice_mr_30724323.wav', 'common_voice_mr_30724320.wav', 'common_voice_mr_30667796.wav', 'common_voice_mr_30705143.wav', 'common_voice_mr_30767416.wav', 'common_voice_mr_30728129.wav', 'common_voice_mr_30724319.wav', 'common_voice_mr_30724317.wav', 'common_voice_mr_30705523.wav', 'common_voice_mr_30767410.wav', 'common_voice_mr_30709179.wav', 'common_voice_mr_30705098.wav', 'common_voice_mr_30709026.wav', 'common_voice_mr_30705224.wav', 'common

500

In [None]:
# Excel files have the details of audio_file_names and audio_text

df = pd.read_excel('/content/drive/MyDrive/Voice_AI_Project/trans.xlsx')  # read excel file using pandas library

f = df.set_index('audio_name')['audio_text'].to_dict()                    # Excel file convert to dictionary file

print(f)

{'common_voice_mr_32645351.wav': 'स्वेच्छानिवृतीचा पध्दत मोठ्या प्रमाणात अंमलात आल्यापासून या दुखण्याने अनेक घरांत प्रवेश केला आहे', 'common_voice_mr_32127661.wav': 'चार चार विषय आहेत', 'common_voice_mr_32127660.wav': 'क्रोष्टु सहस्रजित् नल अंतिक व लघु अशी त्याच्या पुत्रांची नावे आढळतात', 'common_voice_mr_32126825.wav': 'त्यातील एर्देनि हे रत् नाचे व सुबाशिदि हे सुभाषिताचे मंगोलियन भाषारूप होय', 'common_voice_mr_32126698.wav': 'आदरार्थी शब्दांनी केला जात असे लहानपणी गंगेजवळ असताना वसिष्ठांनी त्यांना सर्व वेद शिकविले होते', 'common_voice_mr_32121632.wav': 'खमंग वास येतोय', 'common_voice_mr_32121623.wav': 'चारुता सागर दिनकर दत्तात्रेय भोसले', 'common_voice_mr_32023928.wav': 'ही फेलोशिप मिळवणारे ते पहिले भारतीय होत', 'common_voice_mr_32022227.wav': 'तिचे सूत्रधार शंकरराव देव होते', 'common_voice_mr_31931304.wav': 'चला आज फिरायला जाऊयात', 'common_voice_mr_31928736.wav': 'मध्य भागातील पिवळ्या रंगावर वंदे मातरम् असे लिहिले', 'common_voice_mr_31917816.wav': 'थायलंडमध्ये रावणाचे शिल्प आढळते', 

In [None]:
# Evaluate audio sample for ASR predicted output text and reference text using metrics (WER & similarity_score)

import jiwer

evaluation = {'audio_name':[], 'predicted_text':[], 'reference_text':[], 'similarity_score':[], 'word_error_rate':[]}
c = 0

for i in file_list:

    audio_path = f"/content/drive/MyDrive/Voice_AI_Project/Audio/{i}"       # /content/drive/MyDrive/Voice_AI_Project/Audio/common_voice_mr_27591986.wav

    result = pipe(audio_path,                                               # Automatic Speech Recognition pipeline we pass audio file for speech-to-text process
                  return_timestamps=True,                                   # Timestamps provide information about when each word in the transcription occurs in the audio
                  generate_kwargs={"language": "marathi"})                  # specifies additional settings for ASR process. language of the input audio is Marathi.

    text = remove_special_characters(result['chunks'][0]['text'])           # remove the special charecter from text

    similarity_percentage = calculate_text_similarity_percentage(text, f[i])
    score = float(str(f'{similarity_percentage:.2f}'))

    wer = jiwer.wer(text, f[i])                                             # Calculate Word Error Rate (WER) between ASR predicted and reference text
    error = float(str(f'{wer:.2f}'))                                        # convert value with 2 decimal point only

    evaluation['audio_name'].append(i)
    evaluation['predicted_text'].append(text)
    evaluation['reference_text'].append(f[i])
    evaluation['similarity_score'].append(score)
    evaluation['word_error_rate'].append(error)

    print(f"Audio Samples Completed = {c}")
    c += 1

In [None]:
print(evaluation)

{'audio_name': ['common_voice_mr_30677278.wav', 'common_voice_mr_30728059.wav', 'common_voice_mr_30677264.wav', 'common_voice_mr_30709330.wav', 'common_voice_mr_30767401.wav', 'common_voice_mr_30709267.wav', 'common_voice_mr_30767350.wav', 'common_voice_mr_30666519.wav', 'common_voice_mr_30709028.wav', 'common_voice_mr_30705397.wav', 'common_voice_mr_30705227.wav', 'common_voice_mr_30705427.wav', 'common_voice_mr_30709469.wav', 'common_voice_mr_30716314.wav', 'common_voice_mr_30677352.wav', 'common_voice_mr_30677320.wav', 'common_voice_mr_30709500.wav', 'common_voice_mr_30724323.wav', 'common_voice_mr_30724320.wav', 'common_voice_mr_30667796.wav', 'common_voice_mr_30705143.wav', 'common_voice_mr_30767416.wav', 'common_voice_mr_30728129.wav', 'common_voice_mr_30724319.wav', 'common_voice_mr_30724317.wav', 'common_voice_mr_30705523.wav', 'common_voice_mr_30767410.wav', 'common_voice_mr_30709179.wav', 'common_voice_mr_30705098.wav', 'common_voice_mr_30709026.wav', 'common_voice_mr_3070522

In [None]:
df1 = pd.DataFrame(evaluation)
df1

Unnamed: 0,audio_name,predicted_text,reference_text,similarity_score,word_error_rate
0,common_voice_mr_30677278.wav,गम भणा विशय तपासत नहीं,गमभन विषय तपासत नाही,72.73,0.80
1,common_voice_mr_30728059.wav,त्या कार्कुनाने अंगोल केले परंतु स्वथायाचे दोत...,त्या कारकुनाने आंघोळ केली परंतु स्वतःचे धोतरही...,76.47,0.73
2,common_voice_mr_30677264.wav,नयसर्गेक सवरक्षन खारफुटीची वरिव प्रवाल पेटे नय...,नैसर्गिक संरक्षण खारफुटीची वने व प्रवाळ बेटे न...,69.91,0.86
3,common_voice_mr_30709330.wav,या पक्षाचा प्रभाव प्रामुख्याने उत्तर प्रदेशात आहे,या पक्षाचा प्रभाव प्रामुख्याने उत्तर प्रदेशात आहे,100.00,0.00
4,common_voice_mr_30767401.wav,अंगामी नागांचा देव उकपेनु अकाई तर आणचा देव पाश...,अंगामी नागांचा देव उकपेनुअकाई तर आअंचा देव पाष...,86.25,0.53
...,...,...,...,...,...
495,common_voice_mr_30625521.wav,प्राचीन कथा रामावर केंद्रित असुन सद्यह कालिन क...,प्राचीन कथा रामावर केंद्रित असून सद्यःकालीन कथ...,90.79,0.50
496,common_voice_mr_30477678.wav,ग्रीक संस्कृति तुलना करतात प्राचिन भारत चीन इर...,ग्रीक संस्कृती तुलना करता प्राचीन भारत चीन इरा...,85.71,0.58
497,common_voice_mr_30544778.wav,मग पान्याने कनीक आशा प्रकारे बिजवावी कि तजा तव...,मग पाण्याने कणीक अशा प्रकारे भिजवावी की त्याचा...,77.92,0.73
498,common_voice_mr_30633906.wav,भारतात इन राजकिय पक्ष समाजवदी पक्ष किवा तक्सम ...,भारतातील राजकीय पक्षसमाजवादी पक्ष किंवा तत्सम ...,85.88,0.71


In [None]:
print({'Average similarity_score': df1['similarity_score'].mean(), 'Standard deviation similarity_score': df1['similarity_score'].std()})

{'Average similarity_score': 78.6629, 'Standard deviation similarity_score': 10.63730525632467}


In [None]:
print({'Average word_error_rate' : df1['word_error_rate'].mean(), 'Standard deviation word_error_rate' : df1['word_error_rate'].std()})

{'Average word_error_rate': 0.74402, 'Standard deviation word_error_rate': 0.6038955366936214}


## **OpenAI Whisper Small Pre-trained Model**

In [None]:
# cuda - GPUs are often more efficient for certain types of computations (like deep learning), and using them can significantly speed up your code.

device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [None]:
# float32 - floating-point number for more accuracy and memory to store number ; float16 - less accuracy and memory used in GPU

torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
torch_dtype

torch.float16

In [None]:
# sets up and loads the "whisper-small" model, and moves it to the specified computing device for further processing (like CPU or GPU)

model_id = "openai/whisper-small"

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
print(model)
model.to(device)

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [None]:
#  automatically load the appropriate processor for 'openai/whisper-small' model

processor = AutoProcessor.from_pretrained(model_id)      # model_id = 'openai/whisper-small'
processor

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

WhisperProcessor:
- feature_extractor: WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: WhisperTokenizer(name_or_path='openai/whisper-small', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftranscript|>', '<|en|>', '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>', '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>', '<|vi|>', '<|he|>', '<|uk|>',

In [None]:
# create an Automatic Speech Recognition (ASR) pipeline for the Marathi audio-to-text process

pipe = pipeline(
                task = "automatic-speech-recognition",            # specifies the task for which you are creating the pipeline
                model = model,                                    # specifies the ASR model to be used
                tokenizer = processor.tokenizer,                  # tokenizer preparing raw audio data (continuous waveform) break into smaller chunks or frames that ASR model can process
                feature_extractor = processor.feature_extractor,  # important features extracted from raw audio for model to learn patterns and make predictions about the spoken words in audio
                max_new_tokens = 128,                             # limits the maximum no of tokens in generated text (1 word = 1 token), from 129 to all text are truncated (cut off)
                chunk_length_s = 30,                              # ASR system each chunks of audio to text by processing it in 30-second segments.
                batch_size = 16,                                  # number of audio chunks processed simultaneously in each batch
                return_timestamps = True,                         # Timestamps provide information about when each word in the transcription occurs in the audio
                torch_dtype = torch_dtype,                        # specifies the data type used by the PyTorch framework
                device = device                                   # specifies the device (GPU or CPU) on which the model will run --> 'cuda:0'
              )
pipe

<transformers.pipelines.automatic_speech_recognition.AutomaticSpeechRecognitionPipeline at 0x7cbfef4b6470>

In [None]:
# locale module eneble to get the preferred encoding

locale.getpreferredencoding = lambda: "UTF-8"

# UTF-8 is a character encoding that supports a wide range of characters from various languages,
# making it a good choice for handling text data with different character sets.

In [None]:
# calculates the similarity percentage between two text strings using Levenshtein distance metric

def calculate_text_similarity_percentage(text1, text2):

    distance = Levenshtein.distance(text1, text2)                 # (hello, halo) ------> 'e' to 'a' ----> 'l' to remove ----> so 2 edits need 'hello' convert to 'halo'

    max_length = max(len(text1), len(text2))                      # hello=5; halo=4 -----> max = 5

    similarity_percentage = 100 - (distance / max_length * 100)   # 100 - (2/5 * 100) = 60

    return similarity_percentage                                  # so 60% similarity between 'hello' and 'halo'


In [None]:
# Remove specific special characters from the input text using regular expressions

def remove_special_characters(text):

    pattern = '[,\|\?\.\!\-\;\:\"\“\%\‘\”\�।]'        # defines a set of special characters to be removed.

    cleaned_text = re.sub(pattern, '', text).strip()  # Use re.sub to replace matched characters to empty string & strip() remove empty space in both ends of text

    return cleaned_text

**Sample audio check: 1**

In [None]:
text1 = 'अनेक रचना अभंग गवळणी असे स्फुट लेखन त्यांनी केले'                                   # He wrote many compositions that are unbreakable
text2 = remove_special_characters(' अनेक रचना, अभंग, गवलनी असे स्पूट लेकन त्याननी केली।')   # Many Rachna Abhang Gavalni spoot but she did it

similarity_percentage = calculate_text_similarity_percentage(text1, text2)
print(f'{similarity_percentage:.2f}')

85.42


**Sample audio check: 2**

In [None]:
text1 = 'या पानास लेखाचे स्वरूप यायला हवे'                               # This page should take the form of an article
text2 = remove_special_characters('या पानास लेखासे स्वरूप याईला हवे।')   # This page should be in article format

similarity_percentage = calculate_text_similarity_percentage(text1, text2)
print(f'{similarity_percentage:.2f}')

93.75


**Sample audio check: 3**

In [None]:
text1 = 'अनेक रचना अभंग गवळणी असे स्फुट लेखन त्यांनी केले'  # He wrote a lot of compositions, such as Abhang Gavalni
text2 = 'अनेक रचना अभंग गवळणी असे स्फुट लेखन त्यांनी केले'  # He wrote a lot of compositions, such as Abhang Gavalni

similarity_percentage = calculate_text_similarity_percentage(text1, text2)
print(f'{similarity_percentage:.2f}')

100.00


In [None]:
# uses the os library to list all the files in a specific directory

file_list = os.listdir('/content/drive/MyDrive/Voice_AI_Project/Audio')

print(file_list)

['common_voice_mr_27761932.wav', 'common_voice_mr_27592001.wav', 'common_voice_mr_27703232.wav', 'common_voice_mr_27762809.wav', 'common_voice_mr_27703254.wav', 'common_voice_mr_27761944.wav', 'common_voice_mr_27639584.wav', 'common_voice_mr_27761903.wav', 'common_voice_mr_27593311.wav', 'common_voice_mr_27762395.wav', 'common_voice_mr_27762369.wav', 'common_voice_mr_27761847.wav', 'common_voice_mr_27762912.wav', 'common_voice_mr_27703294.wav', 'common_voice_mr_27703242.wav', 'common_voice_mr_27703303.wav', 'common_voice_mr_27762881.wav', 'common_voice_mr_27593300.wav', 'common_voice_mr_27761963.wav', 'common_voice_mr_27593279.wav', 'common_voice_mr_27593240.wav', 'common_voice_mr_27762407.wav', 'common_voice_mr_27703282.wav', 'common_voice_mr_27593236.wav', 'common_voice_mr_27593322.wav', 'common_voice_mr_27762894.wav', 'common_voice_mr_27762511.wav', 'common_voice_mr_27762871.wav', 'common_voice_mr_27684807.wav', 'common_voice_mr_27591987.wav', 'common_voice_mr_27762501.wav', 'common

In [None]:
len(file_list)

833

In [None]:
# select the 500 audio samples in the evaluation process

file_list = file_list[0:500]

print(file_list)
len(file_list)

['common_voice_mr_27761932.wav', 'common_voice_mr_27592001.wav', 'common_voice_mr_27703232.wav', 'common_voice_mr_27762809.wav', 'common_voice_mr_27703254.wav', 'common_voice_mr_27761944.wav', 'common_voice_mr_27639584.wav', 'common_voice_mr_27761903.wav', 'common_voice_mr_27593311.wav', 'common_voice_mr_27762395.wav', 'common_voice_mr_27762369.wav', 'common_voice_mr_27761847.wav', 'common_voice_mr_27762912.wav', 'common_voice_mr_27703294.wav', 'common_voice_mr_27703242.wav', 'common_voice_mr_27703303.wav', 'common_voice_mr_27762881.wav', 'common_voice_mr_27593300.wav', 'common_voice_mr_27761963.wav', 'common_voice_mr_27593279.wav', 'common_voice_mr_27593240.wav', 'common_voice_mr_27762407.wav', 'common_voice_mr_27703282.wav', 'common_voice_mr_27593236.wav', 'common_voice_mr_27593322.wav', 'common_voice_mr_27762894.wav', 'common_voice_mr_27762511.wav', 'common_voice_mr_27762871.wav', 'common_voice_mr_27684807.wav', 'common_voice_mr_27591987.wav', 'common_voice_mr_27762501.wav', 'common

500

In [None]:
# Excel files have the details of audio_file_names and audio_text

df = pd.read_excel('/content/drive/MyDrive/Voice_AI_Project/trans.xlsx')  # read excel file using pandas library

f = df.set_index('audio_name')['audio_text'].to_dict()                    # Excel file convert to dictionary file

print(f)

{'common_voice_mr_32645351.wav': 'स्वेच्छानिवृतीचा पध्दत मोठ्या प्रमाणात अंमलात आल्यापासून या दुखण्याने अनेक घरांत प्रवेश केला आहे', 'common_voice_mr_32127661.wav': 'चार चार विषय आहेत', 'common_voice_mr_32127660.wav': 'क्रोष्टु सहस्रजित् नल अंतिक व लघु अशी त्याच्या पुत्रांची नावे आढळतात', 'common_voice_mr_32126825.wav': 'त्यातील एर्देनि हे रत् नाचे व सुबाशिदि हे सुभाषिताचे मंगोलियन भाषारूप होय', 'common_voice_mr_32126698.wav': 'आदरार्थी शब्दांनी केला जात असे लहानपणी गंगेजवळ असताना वसिष्ठांनी त्यांना सर्व वेद शिकविले होते', 'common_voice_mr_32121632.wav': 'खमंग वास येतोय', 'common_voice_mr_32121623.wav': 'चारुता सागर दिनकर दत्तात्रेय भोसले', 'common_voice_mr_32023928.wav': 'ही फेलोशिप मिळवणारे ते पहिले भारतीय होत', 'common_voice_mr_32022227.wav': 'तिचे सूत्रधार शंकरराव देव होते', 'common_voice_mr_31931304.wav': 'चला आज फिरायला जाऊयात', 'common_voice_mr_31928736.wav': 'मध्य भागातील पिवळ्या रंगावर वंदे मातरम् असे लिहिले', 'common_voice_mr_31917816.wav': 'थायलंडमध्ये रावणाचे शिल्प आढळते', 

In [None]:
# Evaluate audio sample for ASR predicted output text and reference text using metrics (WER & similarity_score)

import jiwer

evaluation = {'audio_name':[], 'predicted_text':[], 'reference_text':[], 'similarity_score':[], 'word_error_rate':[]}
c = 0

for i in file_list:

    audio_path = f"/content/drive/MyDrive/Voice_AI_Project/Audio/{i}"       # /content/drive/MyDrive/Voice_AI_Project/Audio/common_voice_mr_27591986.wav

    result = pipe(audio_path,                                               # Automatic Speech Recognition pipeline we pass audio file for speech-to-text process
                  return_timestamps=True,                                   # Timestamps provide information about when each word in the transcription occurs in the audio
                  generate_kwargs={"language": "marathi"})                  # specifies additional settings for ASR process. language of the input audio is Marathi.

    text = remove_special_characters(result['chunks'][0]['text'])           # remove the special charecter from text

    similarity_percentage = calculate_text_similarity_percentage(text, f[i])
    score = float(str(f'{similarity_percentage:.2f}'))

    wer = jiwer.wer(text, f[i])                                             # Calculate Word Error Rate (WER) between ASR predicted and reference text
    error = float(str(f'{wer:.2f}'))                                        # convert value with 2 decimal point only

    evaluation['audio_name'].append(i)
    evaluation['predicted_text'].append(text)
    evaluation['reference_text'].append(f[i])
    evaluation['similarity_score'].append(score)
    evaluation['word_error_rate'].append(error)

    print(f"Audio Samples Completed = {c}")
    c += 1

In [None]:
print(evaluation)

{'audio_name': ['common_voice_mr_27761932.wav', 'common_voice_mr_27592001.wav', 'common_voice_mr_27703232.wav', 'common_voice_mr_27762809.wav', 'common_voice_mr_27703254.wav', 'common_voice_mr_27761944.wav', 'common_voice_mr_27639584.wav', 'common_voice_mr_27761903.wav', 'common_voice_mr_27593311.wav', 'common_voice_mr_27762395.wav', 'common_voice_mr_27762369.wav', 'common_voice_mr_27761847.wav', 'common_voice_mr_27762912.wav', 'common_voice_mr_27703294.wav', 'common_voice_mr_27703242.wav', 'common_voice_mr_27703303.wav', 'common_voice_mr_27762881.wav', 'common_voice_mr_27593300.wav', 'common_voice_mr_27761963.wav', 'common_voice_mr_27593279.wav', 'common_voice_mr_27593240.wav', 'common_voice_mr_27762407.wav', 'common_voice_mr_27703282.wav', 'common_voice_mr_27593236.wav', 'common_voice_mr_27593322.wav', 'common_voice_mr_27762894.wav', 'common_voice_mr_27762511.wav', 'common_voice_mr_27762871.wav', 'common_voice_mr_27684807.wav', 'common_voice_mr_27591987.wav', 'common_voice_mr_2776250

In [None]:
df1 = pd.DataFrame(evaluation)
df1

Unnamed: 0,audio_name,predicted_text,reference_text,similarity_score,word_error_rate
0,common_voice_mr_27761932.wav,तान नतर ते तिर्ट मलूं दिलेजाते,त्याननतर ते तीर्थ म्हनून दिले जाते,70.59,1.00
1,common_voice_mr_27592001.wav,दूर्देवाने कही लोक रावनाचा प्रतिमे से दहन करतात,दुर्दैवाने कांही लोक रावणाच्या प्रतिमेचे दहन क...,82.00,0.62
2,common_voice_mr_27703232.wav,उनिकोड मनोगत्ध उनिकोड वाप्रुन तेर के लेले संके...,युनिकोड मनोगत हे युनिकोड वापरून तयार केलेले सं...,64.91,1.00
3,common_voice_mr_27762809.wav,नन्तर त्याज मदू ब्राम्णान कडे पातूं दिले,नंतर त्यास मधुब्राह्मणांकडे पाठवून दिलें,65.00,1.00
4,common_voice_mr_27703254.wav,निट सोरक्षन कर वो या चे नाव मच्चिन्दरनात फसे तेव,नीट संरक्षण कर व ह्याचे नाव मत्स्येंद्रनाथ असे...,66.00,0.80
...,...,...,...,...,...
495,common_voice_mr_30629891.wav,ༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀ...,तेलुगू भाषिक राज्याला मान्यता,0.00,4.00
496,common_voice_mr_30629759.wav,बहुडास लद्डा विष्नुस्पामी के सो कलादरी के तन स...,बहोरदास लढ्ढा विष्णुस्वामी केशव कलाधारी हे त्य...,10.97,1.00
497,common_voice_mr_30629756.wav,बूँस्तर शास्त्रे दूष्क्या करना तकाचे चार बागाचे,भूस्तरशास्त्रीयदृष्ट्या कर्नाटकाचे चार भाग आहेत,68.09,1.00
498,common_voice_mr_30629748.wav,ༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀༀ...,दळणवळणासाठी वाहने लागतात पण खुष्की वाहतुकीसाठी...,0.00,14.00


In [None]:
print({'Average similarity_score': df1['similarity_score'].mean(), 'Standard deviation similarity_score': df1['similarity_score'].std()})

{'Average similarity_score': 30.4756, 'Standard deviation similarity_score': 32.491292760530484}


In [None]:
print({'Average word_error_rate' : df1['word_error_rate'].mean(), 'Standard deviation word_error_rate' : df1['word_error_rate'].std()})

{'Average word_error_rate': 3.37574, 'Standard deviation word_error_rate': 4.064290364224664}
