import spaces
import torch
import gradio as gr
from transformers import pipeline
import numpy as np

MODEL_NAME = "ylacombe/whisper-large-v3-turbo"
BATCH_SIZE = 8
device = 0 if torch.cuda.is_available() else "cpu"

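# Build the ASR pipeline once at startup; chunk_length_s=30 makes the pipeline
# split longer inputs into 30-second windows for long-form transcription.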
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

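# On ZeroGPU Spaces, the spaces.GPU decorator requests a GPU for the duration of each call.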
@spaces.GPU
def transcribe(inputs, previous_transcription):
    try:
        sample_rate, audio_data = inputs

        # Convert int16 PCM samples to float32 normalized to [-1.0, 1.0]
        audio_data = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

        # Pass the sampling rate along with the samples so the pipeline can
        # resample to the 16 kHz expected by Whisper if needed
        transcription = pipe({"sampling_rate": sample_rate, "raw": audio_data},
                             batch_size=BATCH_SIZE,
                             generate_kwargs={"task": "transcribe"},
                             return_timestamps=True)

        # Append new transcription to previous transcription
        previous_transcription += transcription["text"]

        return previous_transcription
    except Exception as e:
        print(f"Error during transcription: {e}")
        return previous_transcription  

with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: Transcribe Audio\nTranscribe microphone input in real time. This demo uses the [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) checkpoint and 🤗 Transformers.")
        input_audio_microphone = gr.Audio(sources=["microphone"], streaming=True, label="Microphone")
        output = gr.Textbox(label="Transcription", value="")

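        # Stream microphone chunks to transcribe() roughly every 2 seconds,
        # capping each streaming session at 45 seconds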
        input_audio_microphone.stream(transcribe, [input_audio_microphone, output], [output], time_limit=45, stream_every=2, concurrency_limit=None)

demo.queue().launch()
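# --- Optional local sanity check (a sketch, not part of the original Space) ---
# Assumes a 1-second silent 16 kHz chunk is an acceptable input and that
# spaces.GPU behaves as a no-op outside a ZeroGPU Space. Uncomment and move
# above demo.queue().launch() to exercise the callback without the UI.
#
# if __name__ == "__main__":
#     sr = 16000
#     silent_chunk = np.zeros(sr, dtype=np.int16)
#     print(transcribe((sr, silent_chunk), ""))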