Update app.py
app.py CHANGED
@@ -1,46 +1,46 @@
-import gradio as gr
-from transformers import pipeline
-import librosa
-import torch
-import numpy as np
-
-MODEL_NAME = 'Rezuwan/regional_asr_weights'
-#device = 0 if torch.cuda.is_available() else "cpu"
-BATCH_SIZE = 8
-FILE_LIMIT_MB = 1000
-
-transcriber = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=30,
-    #device=device,
-)
-
-# Function to preprocess the audio and transcribe it
-def transcribe_audio(audio_path):
-    if audio_path is None:
-        return "No audio provided."
-
-    try:
-        # If audio is a tuple, it is from the microphone (Gradio input type)
-        if isinstance(audio_path, tuple):
-            sample_rate, audio_data = audio_path  # Unpack the tuple (sample rate, numpy array)
-        else:
-            # If audio is a file, it will be a file path (for file uploads)
-            audio_data, sample_rate = librosa.load(audio_path, sr=16000)  # Load the audio file using librosa
-
-        # Convert to mono-channel if necessary (if the audio is stereo)
-        audio_data = librosa.to_mono(audio_data) if audio_data.ndim > 1 else audio_data
-        audio_data = audio_data.astype(np.float32)
-        audio_data /= np.max(np.abs(audio_data))
-        result = transcriber(audio_data)
-        return result["text"]
-
-    except Exception as e:
-        return f"Error: {str(e)}"
-
+# import gradio as gr
+# from transformers import pipeline
+# import librosa
+# import torch
+# import numpy as np
+
+# MODEL_NAME = 'Rezuwan/regional_asr_weights'
+# #device = 0 if torch.cuda.is_available() else "cpu"
+# BATCH_SIZE = 8
+# FILE_LIMIT_MB = 1000
+
+# transcriber = pipeline(
+#     task="automatic-speech-recognition",
+#     model=MODEL_NAME,
+#     chunk_length_s=30,
+#     #device=device,
+# )
+
+# # Function to preprocess the audio and transcribe it
+# def transcribe_audio(audio_path):
+#     if audio_path is None:
+#         return "No audio provided."
+
+#     try:
+#         # If audio is a tuple, it is from the microphone (Gradio input type)
+#         if isinstance(audio_path, tuple):
+#             sample_rate, audio_data = audio_path  # Unpack the tuple (sample rate, numpy array)
+#         else:
+#             # If audio is a file, it will be a file path (for file uploads)
+#             audio_data, sample_rate = librosa.load(audio_path, sr=16000)  # Load the audio file using librosa
+
+#         # Convert to mono-channel if necessary (if the audio is stereo)
+#         audio_data = librosa.to_mono(audio_data) if audio_data.ndim > 1 else audio_data
+#         audio_data = audio_data.astype(np.float32)
+#         audio_data /= np.max(np.abs(audio_data))
+#         result = transcriber(audio_data)
+#         return result["text"]
+
+#     except Exception as e:
+#         return f"Error: {str(e)}"
+
 # Create the Gradio interface for both file upload and microphone input
 # iface = gr.Interface(
@@ -75,37 +75,35 @@ def transcribe_audio(audio_path):
 # )

-iface = gr.Interface(
-    fn=transcribe_audio,
-    inputs=gr.Audio(type="filepath", label="Upload or Record Audio", interactive=True),  # 'filepath' ensures file uploads provide a path for librosa to load
-    outputs="text",
-    outputs=gr.Textbox(label="Transcription"),
-    title="Bengali Speech-to-Text with Regional Dialects",
-    description=(
-        f"""
-        Uses the model card [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of arbitrary length. [Do leave a like (❤️) on the model card and this space]
-
-        Instructions:
-
-        1. Click the 'Record' option in the left 'Upload or Record Audio' section and record the audio.
-        2. When done recording, click the 'Stop' button and give it a moment until the waveform shows up in the 'Upload or Record Audio' section (the same goes for uploaded pre-recorded audio files), then click the 'Submit' button.
-        3. Wait for the audio clip to be processed (this could take a while 😅; inference time still needs work), and the transcription will appear in the 'output' section on the right.
-        4. To submit a trimmed version of the input, select the audio snippet, click 'Trim', wait until the waveform shows up in the input section, and then click 'Submit'.

-        Note:
-
-        1. Since the corpus used to fine-tune this model was quite small, the orthography may not yet be up to the mark; it gets the job done but still needs work and manual validation.
-        2. With proper data and a larger version of the corpus, I should be able to improve its transcription performance on Bengali speech with regional dialects.
-        """
-    ),
-)
-
-iface.launch(share=True)
+# iface.launch(share=True)
+
+import gradio as gr
+from transformers import pipeline
+import librosa
+import numpy as np
+
+transcriber = pipeline("automatic-speech-recognition", model="Rezuwan/regional_asr_weights")
+
+def transcribe_audio(audio_path):
+    try:
+        # Load the clip as mono float32 at 16 kHz, the rate the model expects
+        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
+        audio_data = librosa.to_mono(audio_data) if audio_data.ndim > 1 else audio_data
+        audio_data = audio_data.astype(np.float32)
+        # Peak-normalize to [-1, 1] before inference
+        audio_data /= np.max(np.abs(audio_data))
+        result = transcriber(audio_data)
+        return result["text"]
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+iface = gr.Interface(
+    fn=transcribe_audio,
+    inputs=gr.Audio(type="filepath", label="Upload or Record Audio"),
+    outputs=gr.Textbox(label="Transcription"),
+    title="🗣️ Barishal ASR Speech-to-Text",
+    description="Upload or record an audio file to transcribe Bengali speech using the Barishal ASR model."
+)
+
+iface.launch()
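
One caveat carries over from the old implementation into the new one: `audio_data /= np.max(np.abs(audio_data))` divides by the clip's peak amplitude, which is zero for an all-silent recording. NumPy warns rather than raises on that division, so NaNs flow straight into the model instead of tripping the `except` block. A minimal guard, offered as a sketch rather than as part of this commit:

import numpy as np

def peak_normalize(audio_data: np.ndarray) -> np.ndarray:
    # Scale the clip to a peak of 1.0; return silent input unchanged
    # instead of dividing by zero and producing NaNs.
    peak = np.max(np.abs(audio_data))
    return audio_data / peak if peak > 0 else audio_data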
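
Side note on a design choice visible in the diff: the old tuple-handling branch existed because `gr.Audio(type="numpy")` passes the callback a `(sample_rate, ndarray)` pair, while the new code relies on `type="filepath"` always yielding a path for `librosa.load`. If the numpy input type were ever reinstated, the tuple would also need resampling to 16 kHz, which the removed branch skipped. A hedged sketch of such a helper (`to_model_input` is a hypothetical name, not part of the Space):

import librosa
import numpy as np

def to_model_input(audio, target_sr=16000):
    # Accept either a file path or a Gradio (sample_rate, ndarray) tuple
    # and return mono float32 samples at target_sr.
    if isinstance(audio, tuple):
        sample_rate, data = audio
        data = data.astype(np.float32)
        if data.ndim > 1:                # Gradio delivers (samples, channels)
            data = data.mean(axis=1)
        if np.max(np.abs(data)) > 1.0:   # int16-range samples -> [-1.0, 1.0]
            data /= 32768.0
        if sample_rate != target_sr:
            data = librosa.resample(data, orig_sr=sample_rate, target_sr=target_sr)
        return data
    data, _ = librosa.load(audio, sr=target_sr)
    return data

Relying on `type="filepath"` is the simpler contract, since `librosa.load` then handles resampling uniformly for uploads and microphone recordings alike.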