talk-to-smolvox

Running on T4

App Files Files Community

Steveeeeeeen HF Staff committed on Feb 14

Commit

c567179

verified ·

1 Parent(s): 8baccb8

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -17

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ from twilio.rest import Client
 import os
 import torch
 import librosa
 pipe = transformers.pipeline(
     model="reach-vb/smolvox-smollm2-whisper-turbo",
@@ -21,7 +23,9 @@ auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
 if account_sid and auth_token:
     client = Client(account_sid, auth_token)
     token = client.tokens.create()
     rtc_configuration = {
         "iceServers": token.ice_servers,
         "iceTransportPolicy": "relay",
@@ -29,8 +33,12 @@ if account_sid and auth_token:
 else:
     rtc_configuration = None
-def transcribe(audio: tuple[int, np.ndarray], transformers_chat: list[dict], conversation: list[dict]):
     original_sr = audio[0]
     target_sr = 16000
@@ -40,7 +48,7 @@ def transcribe(audio: tuple[int, np.ndarray], transformers_chat: list[dict], con
     tf_input = [d for d in transformers_chat]
-    # Generate a response from the pipeline using the audio input
     output = pipe(
         {"audio": audio_sr, "turns": tf_input, "sampling_rate": target_sr},
         max_new_tokens=512,
@@ -56,16 +64,22 @@ def transcribe(audio: tuple[int, np.ndarray], transformers_chat: list[dict], con
     yield AdditionalOutputs(transformers_chat, conversation)
-def respond_text(user_text: str, transformers_chat: list[dict], conversation: list[dict]):
     if not user_text.strip():
         return transformers_chat, conversation
     # Append the user message from the textbox
     conversation.append({"role": "user", "content": user_text})
     transformers_chat.append({"role": "user", "content": user_text})
-    # Generate a response using the pipeline. We assume it can process text input via "text"
     output = pipe({"text": user_text, "turns": transformers_chat}, max_new_tokens=512)
     conversation.append({"role": "assistant", "content": output})
@@ -88,7 +102,6 @@ with gr.Blocks() as demo:
         </p>
         """
     )
     # Shared conversation state
     transformers_chat = gr.State(
         value=[
@@ -99,15 +112,13 @@ with gr.Blocks() as demo:
         ]
     )
-    # Chat transcript at the top
-    transcript = gr.Chatbot(label="Transcript", type="messages")
-    # Lower row: text input and audio input side by side
     with gr.Row():
         with gr.Column(scale=1):
             text_input = gr.Textbox(
-                placeholder="Type your message here and press Enter...", label="Your Message"
             )
         with gr.Column(scale=1):
             audio = WebRTC(
                 rtc_configuration=rtc_configuration,
@@ -116,7 +127,7 @@ with gr.Blocks() as demo:
                 modality="audio",
             )
-    # Audio stream: process audio when speaking stops.
     audio.stream(
         ReplyOnPause(transcribe),
         inputs=[audio, transformers_chat, transcript],
@@ -130,14 +141,14 @@ with gr.Blocks() as demo:
         show_progress="hidden",
     )
-    # Text input: submit callback when pressing Enter.
-    text_input.submit(
         respond_text,
         inputs=[text_input, transformers_chat, transcript],
         outputs=[transformers_chat, transcript],
     )
-    # Clear text input after submission.
-    text_input.submit(lambda: "", inputs=[], outputs=[text_input])
 if __name__ == "__main__":
     demo.launch()

 import os
 import torch
 import librosa
+import spaces
 pipe = transformers.pipeline(
     model="reach-vb/smolvox-smollm2-whisper-turbo",
 if account_sid and auth_token:
     client = Client(account_sid, auth_token)
     token = client.tokens.create()
     rtc_configuration = {
         "iceServers": token.ice_servers,
         "iceTransportPolicy": "relay",
 else:
     rtc_configuration = None
+@spaces.GPU(duration=90)
+def transcribe(
+    audio: tuple[int, np.ndarray],
+    transformers_chat: list[dict],
+    conversation: list[dict],
+):
     original_sr = audio[0]
     target_sr = 16000
     tf_input = [d for d in transformers_chat]
+    # Generate response from the pipeline using the audio input
     output = pipe(
         {"audio": audio_sr, "turns": tf_input, "sampling_rate": target_sr},
         max_new_tokens=512,
     yield AdditionalOutputs(transformers_chat, conversation)
+@spaces.GPU(duration=90)
+def respond_text(
+    user_text: str,
+    transformers_chat: list[dict],
+    conversation: list[dict],
+):
     if not user_text.strip():
+        # Do nothing if the textbox is empty
         return transformers_chat, conversation
     # Append the user message from the textbox
     conversation.append({"role": "user", "content": user_text})
     transformers_chat.append({"role": "user", "content": user_text})
+    # Generate a response using the pipeline.
+    # Here we assume the pipeline can also process text input via the "text" key.
     output = pipe({"text": user_text, "turns": transformers_chat}, max_new_tokens=512)
     conversation.append({"role": "assistant", "content": output})
         </p>
         """
     )
     # Shared conversation state
     transformers_chat = gr.State(
         value=[
         ]
     )
     with gr.Row():
         with gr.Column(scale=1):
+            transcript = gr.Chatbot(label="Transcript", type="messages")
             text_input = gr.Textbox(
+                placeholder="Type your message here...", label="Your Message"
             )
+            send_button = gr.Button("Send")
         with gr.Column(scale=1):
             audio = WebRTC(
                 rtc_configuration=rtc_configuration,
                 modality="audio",
             )
+    # Audio stream: when you stop speaking, process the audio input.
     audio.stream(
         ReplyOnPause(transcribe),
         inputs=[audio, transformers_chat, transcript],
         show_progress="hidden",
     )
+    # Text input: when you click "Send", process the typed message.
+    send_button.click(
         respond_text,
         inputs=[text_input, transformers_chat, transcript],
         outputs=[transformers_chat, transcript],
     )
+    # Optionally clear the text box after sending:
+    send_button.click(lambda: "", inputs=[], outputs=[text_input])
 if __name__ == "__main__":
     demo.launch()