Realtime-whisper-large-v3-turbo

Running on Zero

File size: 1,610 Bytes

5d52c32
6c226f9
 
 
d790c0b
88183ad
1e8d252
6cd6646
6c226f9
17f14b2
f696e7e
6c226f9
 
 
 
 
f696e7e
6c226f9
 
 
5d52c32
3da85d4
1e8d252
 
 
 
 
6cd6646
 
 
1e8d252
6cd6646
1e8d252
 
 
 
 
 
 
 
 
 
3da85d4
 
3df1d51
46704ba
4731eae
1e8d252
 
3da85d4
3df1d51

import spaces
import torch
import gradio as gr
from transformers import pipeline
import tempfile
import os
import uuid
import scipy.io.wavfile

# Checkpoint to load and the batch size handed to the pipeline on each call.
MODEL_NAME = "ylacombe/whisper-large-v3-turbo"
BATCH_SIZE = 8

# Run on the first CUDA device when one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Speech-recognition pipeline, built once at import time and shared by all
# requests. Audio is processed in 30-second chunks.
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    chunk_length_s=30,
)

@spaces.GPU
def transcribe(inputs, previous_transcription):
    """Transcribe one audio chunk and append it to the running transcript.

    The chunk is written to a uniquely named temporary WAV file, passed to the
    module-level ``pipe``, and the file is removed afterwards whether or not
    transcription succeeded.

    Args:
        inputs: A ``(sample_rate, audio_data)`` tuple, or ``None`` when no
            audio is available.
        previous_transcription: Text accumulated so far (presumably the
            current content of the output textbox).

    Returns:
        ``previous_transcription`` with the newly recognised text appended.
        On any failure, or when ``inputs`` is ``None``, the transcript is
        returned unchanged.
    """
    # Nothing to transcribe; return the transcript as-is instead of failing
    # on the tuple unpacking below.
    if inputs is None:
        return previous_transcription

    # Unique name so concurrent requests never share a temporary file.
    filepath = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")

    try:
        sample_rate, audio_data = inputs

        # The pipeline is fed a file path, so persist the chunk as WAV first.
        scipy.io.wavfile.write(filepath, sample_rate, audio_data)

        result = pipe(
            filepath,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": "transcribe"},
            return_timestamps=True,
        )
        return previous_transcription + result["text"]
    except Exception as e:
        # Best-effort boundary: a bad chunk must not break the stream, so the
        # error is reported and the transcript so far is kept.
        print(f"Error during transcription: {e}")
        return previous_transcription
    finally:
        # Always clean up, including when writing or transcription failed;
        # otherwise every failed chunk would leave a file in the temp dir.
        try:
            os.remove(filepath)
        except FileNotFoundError:
            pass  # The file was never created.
        except OSError as cleanup_error:
            print(f"Could not remove temporary file {filepath}: {cleanup_error}")

# Build the UI: a streaming audio input above a textbox holding the transcript.
with gr.Blocks() as demo:
    with gr.Column():
        input_audio_microphone = gr.Audio(streaming=True)
        output = gr.Textbox(label="Transcription", value="")

        # The textbox is both an input and the output of the handler, so each
        # call receives the transcript so far and returns the extended one.
        input_audio_microphone.stream(
            fn=transcribe,
            inputs=[input_audio_microphone, output],
            outputs=[output],
            time_limit=45,
            stream_every=2,
            concurrency_limit=None,
        )

# Enable request queuing, then start the app.
demo.queue().launch()