import gradio as gr
from transformers import pipeline
import librosa
import numpy as np
# Fine-tuned Bengali regional-dialect ASR checkpoint on the Hugging Face Hub.
MODEL_NAME = "Rezuwan/regional_asr_weights"
transcriber = pipeline("automatic-speech-recognition", model=MODEL_NAME)
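# For long recordings, chunked inference and an explicit device can help.
# A sketch of one such configuration, used in an earlier revision of this
# Space (assumes torch is installed alongside transformers):
#
#   import torch
#   device = 0 if torch.cuda.is_available() else "cpu"
#   transcriber = pipeline(
#       "automatic-speech-recognition",
#       model=MODEL_NAME,
#       chunk_length_s=30,  # transcribe in 30-second windows
#       device=device,
#   )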
def transcribe_audio(audio_path):
    if audio_path is None:
        return "No audio provided."
    try:
        # Load the file at 16 kHz, the sampling rate the model expects.
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        # Down-mix to a single channel if the recording is stereo.
        audio_data = librosa.to_mono(audio_data) if audio_data.ndim > 1 else audio_data
        # Normalize to [-1, 1], guarding against all-silent input.
        audio_data = audio_data.astype(np.float32)
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data /= peak
        # Pass the sampling rate explicitly so the pipeline need not assume it.
        result = transcriber({"sampling_rate": sample_rate, "raw": audio_data})
        return result["text"]
    except Exception as e:
        return f"Error: {str(e)}"
iface = gr.Interface(
fn=transcribe_audio,
inputs=gr.Audio(type="filepath", label="Upload or Record Audio"),
outputs=gr.Textbox(label="Transcription"),
title="Bengali Speech-to-Text with Regional Dialects",
description=(
f"""
Model Card: [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and πŸ€— Transformers to transcribe audio files of arbitrary length. [Do leave a like (❀️) on the model card and this space]
Instructions:
1. Click on 'Record' option in the left 'Upload or Record Audio' section and record the audio.
2. When done recording, click on 'Stop' button and give it some time until some waveform shows up in the 'Upload or Record Audio' section (Same goes when uploading pre-recorded audio files) and then click the 'Submit' button.
3. Wait for the audio clip to be processed (This could take a while πŸ˜…. Still needs work on the inference time) and then transcription of the audio will appear on the right 'output' section.
4. If want to submit a trimmed version of the input, select the trimmed audio snippet and then click 'Trim' and then wait a bit until wavform
shows up in the input section of the interface and then click 'Submit'.
Note:
1. Since the corpus used to fine-tune this model was really small, The orthography might still not be upto the mark but it gets the work done but still needs work and manual validation.
2.With proper data and a larger version of the corpus, I guess I'll be able to increase it's transcription performance of the Bengali speech with regional dialects.
![](screenshot.jpg)
"""
)
)
iface.launch()
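# When running locally rather than on Spaces, launching with share=True (as an
# earlier revision did) also prints a temporary public URL:
#
#   iface.launch(share=True)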