KingNish committed on
Commit
e19d3c8
·
verified ·
1 Parent(s): 260b031

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -17
app.py CHANGED
@@ -1,50 +1,68 @@
1
  import spaces
2
  import torch
3
  import gradio as gr
4
- from transformers import pipeline
5
  import tempfile
6
  import os
7
  import uuid
8
  import scipy.io.wavfile
 
 
9
 
 
 
# Checkpoint loaded by the speech-recognition pipeline below; also
# interpolated into the demo description.
MODEL_NAME = "ylacombe/whisper-large-v3-turbo"
# Batch size handed to the pipeline on every transcription call.
BATCH_SIZE = 4

# Use CUDA device index 0 when torch reports a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Shared ASR pipeline, built once at import time and reused by transcribe().
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
20
 
@spaces.GPU
def transcribe(inputs, previous_transcription):
    """Transcribe one streamed audio chunk and append it to the transcript.

    Args:
        inputs: A ``(sample_rate, audio_data)`` pair; it is unpacked as such
            and written out as a WAV file for the pipeline to read.
        previous_transcription: Text accumulated so far.

    Returns:
        ``previous_transcription`` with the text of this chunk appended. If
        anything fails, the error is printed and ``previous_transcription``
        is returned unchanged.
    """
    # A random hex name keeps overlapping stream callbacks from sharing a file.
    filename = f"{uuid.uuid4().hex}.wav"
    try:
        sample_rate, audio_data = inputs
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

        transcription = pipe(
            filename,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": "transcribe"},
            return_timestamps=False,
        )["text"]
        previous_transcription += transcription

        return previous_transcription
    except Exception as e:
        # Best-effort boundary for the streaming callback: report the problem
        # and keep the existing transcript instead of breaking the stream.
        print(f"Error during Transcription: {e}")
        return previous_transcription
    finally:
        # This callback fires repeatedly while streaming; without cleanup one
        # WAV file per chunk would pile up in the working directory.
        try:
            os.remove(filename)
        except OSError:
            pass
 
 
 
 
41
 
# Build the demo page: a description, a streaming audio input and a textbox
# that shows the running transcript.
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            f"# Realtime Whisper Large V3 Turbo: \n"
            f" Transcribe Audio in Realtime. This Demo uses the Checkpoint "
            f"[{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n"
            f" Note: The first token takes about 5 seconds. After that, it works flawlessly."
        )
        input_audio_microphone = gr.Audio(streaming=True)
        output = gr.Textbox(label="Transcription", value="")

        # The textbox is both an input and the output, so every streamed chunk
        # receives the transcript so far and returns the extended one.
        input_audio_microphone.stream(
            fn=transcribe,
            inputs=[input_audio_microphone, output],
            outputs=[output],
            time_limit=45,
            stream_every=2,
            concurrency_limit=None,
        )

demo.launch()
 
1
  import spaces
2
  import torch
3
  import gradio as gr
 
4
  import tempfile
5
  import os
6
  import uuid
7
  import scipy.io.wavfile
8
+ import time
9
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
10
 
# Run on CUDA when torch reports a GPU, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision only on the GPU; the CPU fallback keeps float32 instead of
# being forced into float16.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
MODEL_NAME = "ylacombe/whisper-large-v3-turbo"

# Load the checkpoint once at import time and move it to the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor supplies the feature extractor; the tokenizer is loaded
# separately with language="en".
processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language="en")

# Shared ASR pipeline reused by transcribe() for every streamed chunk.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=25,
    torch_dtype=torch_dtype,
    device=device,
)
32
 
@spaces.GPU
def transcribe(inputs, previous_transcription):
    """Transcribe one streamed audio chunk and report how long it took.

    Args:
        inputs: A ``(sample_rate, audio_data)`` pair; it is unpacked as such
            and written out as a WAV file for the pipeline to read.
        previous_transcription: Text accumulated so far.

    Returns:
        A ``(transcript, latency)`` tuple. ``transcript`` is
        ``previous_transcription`` with the text of this chunk appended and
        ``latency`` is the elapsed time in seconds formatted with two
        decimals. If anything fails, the error is printed and
        ``(previous_transcription, "Error")`` is returned.
    """
    # Monotonic clock: the measurement cannot be skewed by wall-clock changes.
    start_time = time.perf_counter()
    # A random hex name keeps overlapping stream callbacks from sharing a file.
    filename = f"{uuid.uuid4().hex}.wav"
    try:
        sample_rate, audio_data = inputs
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

        transcription = pipe(filename)["text"]
        previous_transcription += transcription

        latency = time.perf_counter() - start_time
        # The original `str(latency:.2f)` is not valid Python; a format spec
        # is only allowed inside an f-string or format().
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        # Best-effort boundary for the streaming callback: report the problem
        # and keep the existing transcript instead of breaking the stream.
        print(f"Error during Transcription: {e}")
        return previous_transcription, "Error"
    finally:
        # This callback fires repeatedly while streaming; without cleanup one
        # WAV file per chunk would pile up in the working directory.
        try:
            os.remove(filename)
        except OSError:
            pass
50
+
51
+
def clear() -> str:
    """Return an empty string, the value used to reset a text output."""
    return ""
54
 
# Build the demo page: a description, a row with the streaming audio input,
# the running transcript and the latency readout, and a row with the clear
# button.
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            f"# Realtime Whisper Large V3 Turbo: \n"
            f" Transcribe Audio in Realtime. This Demo uses the Checkpoint "
            f"[{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n"
            f" Note: The first token takes about 5 seconds. After that, it works flawlessly."
        )
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")

        # The transcript textbox is both an input and an output, so every
        # streamed chunk receives the text so far and returns the extended
        # text together with the latency string.
        input_audio_microphone.stream(
            fn=transcribe,
            inputs=[input_audio_microphone, output],
            outputs=[output, latency_textbox],
            time_limit=45,
            stream_every=2,
            concurrency_limit=None,
        )
        # Clicking the button overwrites the transcript with clear()'s result.
        clear_button.click(fn=clear, outputs=[output])

demo.launch()