# Hugging Face Space: medical speech-to-text demo (fine-tuned Whisper + Gradio UI).
import os
import tempfile
import time

import gradio as gr
import torch
from pydub import AudioSegment
from scipy.io.wavfile import write
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# --- Configuration ---
MODEL_ID = "Crystalcareai/Whisper-Medicalv1"  # Whisper fine-tune for medical speech
CHUNK_LENGTH_S = 30        # pipeline-internal chunking window, in seconds
RETURN_TIMESTAMPS = True   # required for audio longer than Whisper's 30 s window

# Use a GPU (with half precision) when one is available; otherwise fall back
# to the original CPU/float32 behavior.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device != "cpu" else torch.float32
# Load the checkpoint and its processor once at startup so every request
# reuses the same in-memory model.
print("Loading model...")
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Assemble the ASR pipeline from the preloaded model and processor parts.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    chunk_length_s=CHUNK_LENGTH_S,
    return_timestamps=RETURN_TIMESTAMPS,
)
def split_audio(path, chunk_len_ms=30_000):
    """Split the audio file at *path* into consecutive slices.

    Returns a list of pydub ``AudioSegment`` pieces, each at most
    ``chunk_len_ms`` milliseconds long (the final piece may be shorter).
    """
    segment = AudioSegment.from_file(path)
    pieces = []
    for start in range(0, len(segment), chunk_len_ms):
        pieces.append(segment[start:start + chunk_len_ms])
    return pieces
def transcribe_chunkwise(audio, transcript_state):
    """Transcribe *audio* in 30-second chunks, streaming partial results.

    Args:
        audio: Gradio audio value as a ``(sample_rate, numpy_array)`` tuple,
            or ``None`` when nothing was recorded/uploaded.
        transcript_state: mutable list held in ``gr.State``; each chunk's
            text is appended to it, so the transcript accumulates in place.

    Yields:
        The accumulated transcript joined with newlines after each chunk,
        so the UI textbox updates progressively. Yields a single value
        because both click handlers bind exactly one output component.
    """
    if audio is None:
        yield "No audio input detected"
        return
    sr, audio_data = audio
    # Persist the full recording to a temp WAV so pydub can read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        write(tmp.name, sr, audio_data)
        path = tmp.name
    try:
        for chunk in split_audio(path):
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ctmp:
                chunk.export(ctmp.name, format="wav")
                chunk_path = ctmp.name
            try:
                text = pipe(chunk_path)["text"]
            finally:
                os.remove(chunk_path)  # don't leak one temp file per chunk
            transcript_state.append(text)
            yield "\n".join(transcript_state)
    finally:
        os.remove(path)  # clean up the whole-recording temp file too
def transcribe(audio):
    """Transcribe a whole recording in a single pipeline call.

    Args:
        audio: Gradio audio value ``(sample_rate, numpy_array)`` or ``None``.

    Returns:
        The transcription with a line break after each sentence-ending
        punctuation mark, or an error string when no audio was provided.
    """
    if audio is None:
        return "No audio input detected"
    sr, audio_data = audio
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        write(tmp.name, sr, audio_data)
        tmp_path = tmp.name
    try:
        text = pipe(tmp_path)["text"]
    finally:
        os.remove(tmp_path)  # remove the temp file even if the pipeline raises
    return text.replace(". ", ".\n").replace("! ", "!\n").replace("? ", "?\n")
with gr.Blocks() as app:
    # Shared accumulator of transcribed chunk texts (mutated in place by
    # transcribe_chunkwise across its yields).
    state = gr.State([])
    mic_input = gr.Audio(sources=["microphone"], label="🎙️ Speak")
    mic_run_btn = gr.Button("Transcribe")
    file_input = gr.Audio(sources=["upload"], label="📁 Upload Audio")
    file_run_btn = gr.Button("Transcribe")
    output_textbox = gr.Textbox(label="Transcription")
    # Each endpoint needs a distinct api_name: Gradio rejects duplicates.
    mic_run_btn.click(
        fn=transcribe_chunkwise,
        inputs=[mic_input, state],
        outputs=output_textbox,
        api_name="transcribe_mic",
        show_progress=True,
    )
    file_run_btn.click(
        fn=transcribe_chunkwise,
        inputs=[file_input, state],
        outputs=output_textbox,
        api_name="transcribe_file",
        show_progress=True,
    )
app.launch()