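"""Gradio demo: Bengali speech-to-text with regional dialects.

Wraps the Rezuwan/regional_asr_weights ASR checkpoint in a 🤗 Transformers
pipeline and exposes it through a simple upload/record interface.
"""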
import gradio as gr
from transformers import pipeline
import librosa
import numpy as np
import torch

MODEL_NAME = "Rezuwan/regional_asr_weights"

# chunk_length_s lets the pipeline chunk and transcribe clips longer than the
# model's input window; run on the GPU when one is available.
device = 0 if torch.cuda.is_available() else "cpu"
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)


def transcribe_audio(audio_path):
    """Load an audio file, normalize it, and return the model's transcription."""
    if audio_path is None:
        return "No audio provided."
    try:
        # Gradio's 'filepath' input supplies a path for both uploads and
        # microphone recordings; resample to the 16 kHz rate the model expects.
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)

        # Down-mix to mono if the clip is stereo.
        if audio_data.ndim > 1:
            audio_data = librosa.to_mono(audio_data)

        # Peak-normalize to [-1, 1], guarding against an all-silent clip.
        audio_data = audio_data.astype(np.float32)
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data /= peak

        # Pass the sampling rate explicitly so the pipeline does not have to
        # assume one for the raw array.
        result = transcriber({"sampling_rate": sample_rate, "raw": audio_data})
        return result["text"]
    except Exception as e:
        return f"Error: {e}"


iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Upload or Record Audio"),
    outputs=gr.Textbox(label="Transcription"),
    title="Bengali Speech-to-Text with Regional Dialects",
    description=(
        f"""
Model Card: [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}). This Space uses 🤗 Transformers to transcribe audio clips of arbitrary length. [Do leave a like (❤️) on the model card and this Space.]

Instructions:

1. Click the 'Record' option in the 'Upload or Record Audio' section on the left and record your audio.
2. When you are done recording, click 'Stop' and wait until a waveform appears in the 'Upload or Record Audio' section (the same applies when uploading a pre-recorded audio file), then click 'Submit'.
3. Wait for the clip to be processed (this can take a while 😅; inference time still needs work). The transcription will then appear in the 'Transcription' section on the right.
4. To submit a trimmed version of the input, select the snippet you want, click 'Trim', wait until the new waveform appears in the input section, and then click 'Submit'.

Note:

1. Since the corpus used to fine-tune this model was quite small, the orthography may not be fully up to the mark yet; it gets the job done but still needs work and manual validation.
2. With proper data and a larger version of the corpus, the model's transcription performance on Bengali speech with regional dialects should improve.

![](screenshot.jpg)
"""
    ),
    allow_flagging="never",
)

iface.launch()
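# A minimal sketch of a local smoke test that bypasses the UI, assuming a
# hypothetical "sample.wav" in the working directory (swap in any real clip):
#
#     print(transcribe_audio("sample.wav"))
#
# The file takes the same librosa resampling/normalization path as a Gradio
# upload, so this is a quick way to sanity-check the model before launching.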