import gradio as gr
from transformers import pipeline
import librosa
import numpy as np

MODEL_NAME = "Rezuwan/regional_asr_weights"

# chunk_length_s=30 splits long recordings into 30-second chunks so that
# clips of arbitrary length can be transcribed.
transcriber = pipeline("automatic-speech-recognition", model=MODEL_NAME, chunk_length_s=30)
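# Optional GPU placement (a sketch, not enabled here): transformers.pipeline
# accepts a `device` argument, e.g.
#
#     import torch
#     device = 0 if torch.cuda.is_available() else "cpu"
#     transcriber = pipeline("automatic-speech-recognition", model=MODEL_NAME,
#                            chunk_length_s=30, device=device)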
def transcribe_audio(audio_path):
    """Load an audio file, normalize it, and return the transcription."""
    if audio_path is None:
        return "No audio provided."
    try:
        # Load at 16 kHz, the sampling rate the model expects
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        # Down-mix stereo to mono if necessary
        audio_data = librosa.to_mono(audio_data) if audio_data.ndim > 1 else audio_data
        audio_data = audio_data.astype(np.float32)
        # Peak-normalize, guarding against an all-silent clip
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data /= peak
        result = transcriber(audio_data)
        return result["text"]
    except Exception as e:
        return f"Error: {str(e)}"
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Upload or Record Audio"),
    outputs=gr.Textbox(label="Transcription"),
    title="Bengali Speech-to-Text with Regional Dialects",
    description=(
        f"""
        This Space uses [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) with 🤗 Transformers to transcribe audio files of arbitrary length. [Do leave a like (❤️) on the model card and this Space.]

        Instructions:

        1. Click 'Record' in the 'Upload or Record Audio' section on the left and record your audio.
        2. When you are done recording, click 'Stop' and wait until a waveform appears in the 'Upload or Record Audio' section (the same applies when uploading pre-recorded audio files), then click 'Submit'.
        3. Wait for the clip to be processed (this can take a while; inference time still needs work). The transcription will appear in the 'Transcription' section on the right.
        4. To submit a trimmed version of the input, select the desired snippet, click 'Trim', wait until the waveform reappears in the input section, and then click 'Submit'.

        Note:

        1. Since the corpus used to fine-tune this model was quite small, the orthography may not be fully accurate yet; the output is usable but still needs further work and manual validation.
        2. With more data and a larger version of the corpus, transcription performance on Bengali speech with regional dialects should improve.

        
        """
    )
)
iface.launch()
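# Design note: launch(share=True) would also generate a temporary public URL,
# which is useful when running locally; on Hugging Face Spaces the app is
# already served publicly, so the plain launch() above suffices.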