# Hugging Face Space status: Running on T4
| import gradio as gr | |
| from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs | |
| import transformers | |
| import numpy as np | |
| from twilio.rest import Client | |
| import os | |
| import torch | |
| import librosa | |
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# module can still be loaded (more slowly) on hosts without CUDA.
_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Chat model that receives the user's audio (or typed turns) plus the turn
# history. `trust_remote_code=True` allows transformers to run the pipeline
# code that ships inside the model repository.
pipe = transformers.pipeline(
    model="reach-vb/smolvox-smollm2-whisper-turbo",
    trust_remote_code=True,
    device=_DEVICE,
)

# Separate Whisper pipeline; in this file it is used only to get a text
# transcript of the user's speech for the visible conversation.
whisper = transformers.pipeline(
    model="openai/whisper-large-v3-turbo",
    device=_DEVICE,
)
# WebRTC connection settings. When both Twilio credentials are present in the
# environment, ICE servers are requested from Twilio and the transport policy
# is set to "relay"; otherwise no explicit configuration is supplied.
rtc_configuration = None
account_sid = os.getenv("TWILIO_ACCOUNT_SID")
auth_token = os.getenv("TWILIO_AUTH_TOKEN")
if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
def transcribe(audio: tuple[int, np.ndarray], transformers_chat: list[dict], conversation: list[dict]):
    """Answer one spoken utterance and record the exchange in both histories.

    Args:
        audio: ``(sample_rate, samples)`` pair for the captured utterance. The
            samples are divided by 32768.0, i.e. treated as 16-bit PCM values.
        transformers_chat: Turn history handed to the chat pipeline; extended
            in place with the new user and assistant turns.
        conversation: Message list shown in the transcript; extended in place
            with the same two turns.

    Yields:
        A single ``AdditionalOutputs`` carrying ``transformers_chat`` and
        ``conversation`` after the update.
    """
    model_rate = 16000
    source_rate = audio[0]

    # Scale to floats and resample to the rate passed to both models below.
    waveform = librosa.resample(
        audio[1].astype(np.float32) / 32768.0,
        orig_sr=source_rate,
        target_sr=model_rate,
    )

    # The pipeline gets a shallow copy of the history rather than the stored
    # list itself (presumably so that anything the pipeline does to its
    # "turns" list cannot leak into the session state -- TODO confirm).
    turns_snapshot = list(transformers_chat)
    reply = pipe(
        {"audio": waveform, "turns": turns_snapshot, "sampling_rate": model_rate},
        max_new_tokens=2048,
    )

    # A separate Whisper pass supplies the text of what the user said.
    heard = whisper({"array": waveform.squeeze(), "sampling_rate": model_rate})["text"]

    # Record the exchange in both histories, using fresh dicts for each list.
    for history in (conversation, transformers_chat):
        history.append({"role": "user", "content": heard})
        history.append({"role": "assistant", "content": reply})

    yield AdditionalOutputs(transformers_chat, conversation)
def respond_text(user_text: str, transformers_chat: list[dict], conversation: list[dict]):
    """Answer a typed message and record the exchange in both histories.

    Args:
        user_text: Text submitted from the message box. ``None``, empty and
            whitespace-only values are ignored.
        transformers_chat: Turn history handed to the chat pipeline; extended
            in place on success.
        conversation: Message list shown in the transcript; extended in place
            on success.

    Returns:
        ``(transformers_chat, conversation)`` -- the same two list objects,
        unchanged when the input was blank.
    """
    # A textbox value may arrive as None as well as ""; treat both as "nothing
    # to send" instead of failing on None.strip().
    if not user_text or not user_text.strip():
        return transformers_chat, conversation

    # Ask the model first, using a temporary turn list that already ends with
    # the new user turn. If generation raises, the shared histories are left
    # untouched rather than holding an unanswered user turn.
    # As in the audio path, "turns" carries the history; we assume the
    # pipeline can process text input via "text".
    output = pipe(
        {
            "text": user_text,
            "turns": [*transformers_chat, {"role": "user", "content": user_text}],
        },
        max_new_tokens=512,
    )

    # Record the exchange in both histories, using fresh dicts for each list.
    for history in (conversation, transformers_chat):
        history.append({"role": "user", "content": user_text})
        history.append({"role": "assistant", "content": output})

    return transformers_chat, conversation
# Page header markup, rendered verbatim at the top of the app.
_HEADER_HTML = """
<h1 style='text-align: center'>
Talk to Smolvox Smollm2 1.7b (Powered by WebRTC ⚡️)
</h1>
<p style='text-align: center'>
Once you grant access to your microphone, you can talk naturally to Ultravox.
When you stop talking, the audio will be sent for processing.
</p>
<p style='text-align: center'>
Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
</p>
"""

# Opening system turn that seeds the model-side history.
_SYSTEM_TURN = {
    "role": "system",
    "content": "You are a friendly and helpful character. You love to answer questions for people.",
}


def _forward_histories(chat_turns, transcript_messages):
    """Pass the two histories emitted by the audio handler straight through."""
    return chat_turns, transcript_messages


def _blank_textbox():
    """Return the empty string used to reset the message box."""
    return ""


with gr.Blocks() as demo:
    gr.HTML(_HEADER_HTML)

    # Model-side turn history, shared by the audio and text handlers.
    transformers_chat = gr.State(value=[_SYSTEM_TURN])

    # Visible transcript sits above the inputs.
    transcript = gr.Chatbot(label="Transcript", type="messages")

    # Input row: typed message on the left, microphone stream on the right.
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Your Message",
                placeholder="Type your message here and press Enter...",
            )
        with gr.Column(scale=1):
            audio = WebRTC(
                label="Stream",
                modality="audio",
                mode="send",
                rtc_configuration=rtc_configuration,
            )

    # Voice path: `transcribe` runs once the speaker pauses; each stream is
    # capped at 90 seconds.
    audio.stream(
        ReplyOnPause(transcribe),
        inputs=[audio, transformers_chat, transcript],
        outputs=[audio],
        time_limit=90,
    )
    # The histories yielded by `transcribe` are copied into the state and the
    # transcript without queueing or a progress indicator.
    audio.on_additional_outputs(
        _forward_histories,
        outputs=[transformers_chat, transcript],
        queue=False,
        show_progress="hidden",
    )

    # Text path: pressing Enter sends the message through `respond_text` ...
    text_input.submit(
        respond_text,
        inputs=[text_input, transformers_chat, transcript],
        outputs=[transformers_chat, transcript],
    )
    # ... and a second listener on the same event empties the box.
    text_input.submit(_blank_textbox, inputs=[], outputs=[text_input])

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()