Spaces:

freddyaboulton
/

gemini-audio-video-chat

Running

App Files Files Community

ahundt committed on Feb 21

Commit

b7a0a78

1 Parent(s): e45ab03

try more robust code (not working yet), notifications about status

Browse files

Files changed (3) hide show

app.py +255 -85
pyproject.toml +11 -0
uv.lock +0 -0

app.py CHANGED Viewed

@@ -3,6 +3,8 @@ import base64
 import os
 import time
 from io import BytesIO
 import gradio as gr
 import numpy as np
@@ -17,21 +19,82 @@ from gradio_webrtc import (
 )
 from PIL import Image
-def encode_audio(data: np.ndarray) -> dict:
-    """Encode Audio data to send to the server"""
-    return {"mime_type": "audio/pcm", "data": base64.b64encode(data.tobytes()).decode("UTF-8")}
 def encode_image(data: np.ndarray) -> dict:
-    with BytesIO() as output_bytes:
-        pil_image = Image.fromarray(data)
-        pil_image.save(output_bytes, "JPEG")
-        bytes_data = output_bytes.getvalue()
-    base64_str = str(base64.b64encode(bytes_data), "utf-8")
-    return {"mime_type": "image/jpeg", "data": base64_str}
 class GeminiHandler(AsyncAudioVideoStreamHandler):
     def __init__(
         self, expected_layout="mono", output_sample_rate=24000, output_frame_size=480
@@ -54,117 +117,224 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
             output_sample_rate=self.output_sample_rate,
             output_frame_size=self.output_frame_size,
         )
     async def video_receive(self, frame: np.ndarray):
         if self.session:
-            # send image every 1 second
-            if time.time() - self.last_frame_time > 1:
-                self.last_frame_time = time.time()
-                await self.session.send(encode_image(frame))
-                if self.latest_args[2] is not None:
-                    await self.session.send(encode_image(self.latest_args[2]))
-        self.video_queue.put_nowait(frame)
     async def video_emit(self) -> VideoEmitType:
-        return await self.video_queue.get()
     async def connect(self, api_key: str):
         if self.session is None:
-            client = genai.Client(api_key=api_key, http_options={"api_version": "v1alpha"})
-            config = {"response_modalities": ["AUDIO"]}
-            async with client.aio.live.connect(
-                model="gemini-2.0-flash-exp", config=config
-            ) as session:
-                self.session = session
-                asyncio.create_task(self.receive_audio())
-                await self.quit.wait()
     async def generator(self):
         while not self.quit.is_set():
-            turn = self.session.receive()
-            async for response in turn:
-                if data := response.data:
-                    yield data
     async def receive_audio(self):
-        async for audio_response in async_aggregate_bytes_to_16bit(
-            self.generator()
-        ):
-            self.audio_queue.put_nowait(audio_response)
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         _, array = frame
         array = array.squeeze()
-        audio_message = encode_audio(array)
-        if self.session:
-            await self.session.send(audio_message)
     async def emit(self) -> AudioEmitType:
         if not self.args_set.is_set():
             await self.wait_for_args()
         if self.session is None:
             asyncio.create_task(self.connect(self.latest_args[1]))
-        array = await self.audio_queue.get()
-        return (self.output_sample_rate, array)
     def shutdown(self) -> None:
         self.quit.set()
         self.connection = None
         self.args_set.clear()
         self.quit.clear()
 css = """
 #video-source {max-width: 600px !important; max-height: 600 !important;}
 """
-with gr.Blocks(css=css) as demo:
-    gr.HTML(
-        """
-    <div style='display: flex; align-items: center; justify-content: center; gap: 20px'>
-        <div style="background-color: var(--block-background-fill); border-radius: 8px">
-            <img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
-        </div>
-        <div>
-            <h1>Gen AI SDK Voice Chat</h1>
-            <p>Speak with Gemini using real-time audio + video streaming</p>
-            <p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href=https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>⚡️</p>
-            <p>Get an API Key <a href="https://support.google.com/googleapi/answer/6158862?hl=en">here</a></p>
         </div>
-    </div>
-    """
-    )
-    with gr.Row() as api_key_row:
-        api_key = gr.Textbox(label="API Key", type="password", placeholder="Enter your API Key", value=os.getenv("GOOGLE_API_KEY"))
-    with gr.Row(visible=False) as row:
-        with gr.Column():
-            webrtc = WebRTC(
-                label="Video Chat",
-                modality="audio-video",
-                mode="send-receive",
-                elem_id="video-source",
-                rtc_configuration=get_twilio_turn_credentials(),
-                icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
-                pulse_color="rgb(35, 157, 225)",
-                icon_button_color="rgb(35, 157, 225)",
-            )
-        with gr.Column():
-            image_input = gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
-        webrtc.stream(
-            GeminiHandler(),
-            inputs=[webrtc, api_key, image_input],
-            outputs=[webrtc],
-            time_limit=90,
-            concurrency_limit=2,
         )
         api_key.submit(
-        lambda: (gr.update(visible=False), gr.update(visible=True)),
-        None,
-        [api_key_row, row],
-    )
-if __name__ == "__main__":
     demo.launch()

 import os
 import time
 from io import BytesIO
+import logging
+import traceback  # Import traceback
 import gradio as gr
 import numpy as np
 )
 from PIL import Image
+# --- Setup Logging ---
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# --- Global State ---
+twilio_available = None  # None = not checked, True = available, False = unavailable
+gemini_connected = False  # Track Gemini connection status
+load_complete = asyncio.Event()  # Event to signal demo.load completion
+# --- Helper Functions ---
+def encode_audio(data: np.ndarray) -> dict:
+    """Encode Audio data to send to the server."""
+    if not isinstance(data, np.ndarray):
+        raise TypeError("encode_audio expected a numpy.ndarray")
+    try:
+        return {"mime_type": "audio/pcm", "data": base64.b64encode(data.tobytes()).decode("UTF-8")}
+    except Exception as e:
+        logger.error(f"Error encoding audio: {e}")
+        raise  # Re-raise the exception after logging
 def encode_image(data: np.ndarray) -> dict:
+    """Encode Image data to send to the server."""
+    if not isinstance(data, np.ndarray):
+        raise TypeError("encode_image expected a numpy.ndarray")
+    try:
+        with BytesIO() as output_bytes:
+            pil_image = Image.fromarray(data)
+            pil_image.save(output_bytes, "JPEG")
+            bytes_data = output_bytes.getvalue()
+        base64_str = str(base64.b64encode(bytes_data), "utf-8")
+        return {"mime_type": "image/jpeg", "data": base64_str}
+    except Exception as e:
+        logger.error(f"Error encoding image: {e}")
+        raise
+async def check_twilio_availability() -> bool:
+    """Checks Twilio TURN server availability with retries and timeout."""
+    global twilio_available
+    timeout = 10
+    retries = 3
+    delay = 2
+    try:
+        async with asyncio.timeout(timeout):
+            for attempt in range(retries):
+                try:
+                    # VERY DETAILED LOGGING HERE
+                    logger.info(f"Attempting to get Twilio credentials (attempt {attempt + 1})...")
+                    credentials = get_twilio_turn_credentials()
+                    logger.info(f"Twilio credentials response: {credentials}")  # Log the response
+                    if credentials:
+                        twilio_available = True
+                        logger.info("Twilio TURN server available.")
+                        return True
+                except Exception as e:
+                    logger.warning(f"Attempt {attempt + 1} to get Twilio credentials failed: {e}")
+                    # Print the full traceback
+                    logger.warning(traceback.format_exc())
+                    if attempt < retries - 1:
+                        await asyncio.sleep(delay)
+            twilio_available = False
+            logger.warning("Twilio TURN server unavailable after multiple attempts.")
+            return False
+    except asyncio.TimeoutError:
+        twilio_available = False
+        logger.error(f"Twilio TURN server check timed out after {timeout} seconds.")
+        return False
+    except Exception as e:
+        twilio_available = False
+        logger.exception(f"Unexpected error checking Twilio availability: {e}")
+        return False
+# --- Gemini Handler Class ---
 class GeminiHandler(AsyncAudioVideoStreamHandler):
     def __init__(
         self, expected_layout="mono", output_sample_rate=24000, output_frame_size=480
             output_sample_rate=self.output_sample_rate,
             output_frame_size=self.output_frame_size,
         )
     async def video_receive(self, frame: np.ndarray):
         if self.session:
+            try:
+                # send image every 1 second
+                if time.time() - self.last_frame_time > 1:
+                    self.last_frame_time = time.time()
+                    await self.session.send(encode_image(frame))
+                    if self.latest_args[2] is not None:
+                        await self.session.send(encode_image(self.latest_args[2]))
+            except Exception as e:
+                logger.error(f"Error sending video frame: {e}")
+                gr.Warning("Error sending video to Gemini. Check your connection and API key.")
+        self.video_queue.put_nowait(frame)  # Always put the frame in the queue
     async def video_emit(self) -> VideoEmitType:
+        try:
+            return await self.video_queue.get()
+        except asyncio.CancelledError:
+            logger.info("Video emit cancelled.")
+            return None # Or some other default value
+        except Exception as e:
+            logger.exception(f"Error in video_emit: {e}")
+            return None
     async def connect(self, api_key: str):
+        global gemini_connected
         if self.session is None:
+            try:
+                client = genai.Client(api_key=api_key, http_options={"api_version": "v1alpha"})
+                config = {"response_modalities": ["AUDIO"]}
+                async with client.aio.live.connect(
+                    model="gemini-2.0-flash-exp", config=config
+                ) as session:
+                    self.session = session
+                    gemini_connected = True
+                    asyncio.create_task(self.receive_audio())
+                    await self.quit.wait()
+            except Exception as e:
+                logger.error(f"Error connecting to Gemini: {e}")
+                gemini_connected = False  # Set connection status to False
+                self.shutdown()
+                # Display error in the UI
+                gr.Warning(f"Failed to connect to Gemini: {e}")
+            finally:  # Update UI *after* connection attempt (both success and failure)
+                gr.Info(f"Gemini connection status: {'Connected' if gemini_connected else 'Disconnected'}")
     async def generator(self):
+        if not self.session:  # Check if session exists
+            logger.warning("Gemini session is not initialized.")
+            return  # Or raise an exception, depending on desired behavior
         while not self.quit.is_set():
+            try:
+                turn = await self.session.receive()
+                async for response in turn:
+                    if data := response.data:
+                        yield data
+            except Exception as e:
+                logger.error(f"Error receiving from Gemini: {e}")
+                gr.Warning("Error communicating with Gemini.  Check network and API key.")
+                break # Exit the loop on error
     async def receive_audio(self):
+        try:
+            async for audio_response in async_aggregate_bytes_to_16bit(
+                self.generator()
+            ):
+                self.audio_queue.put_nowait(audio_response)
+        except asyncio.CancelledError:
+            logger.info("Audio receive cancelled.")
+        except Exception as e:
+            logger.exception(f"Error in receive_audio: {e}")
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         _, array = frame
         array = array.squeeze()
+        try:
+            audio_message = encode_audio(array)
+            if self.session:
+                await self.session.send(audio_message)
+        except Exception as e:
+            logger.error(f"Error sending audio: {e}")
+            gr.Warning("Error sending audio to Gemini. Check your connection and API key.")
     async def emit(self) -> AudioEmitType:
         if not self.args_set.is_set():
             await self.wait_for_args()
         if self.session is None:
+          try:
             asyncio.create_task(self.connect(self.latest_args[1]))
+          except Exception as e:
+                logger.error(f"emit error connecting: {e}")
+        try:
+            array = await self.audio_queue.get()
+            return (self.output_sample_rate, array)
+        except asyncio.CancelledError:
+            logger.info("Audio emit cancelled.")
+            return (self.output_sample_rate, np.array([]))
+        except Exception as e:
+            logger.exception(f"Error in emit: {e}")
+            return (self.output_sample_rate, np.array([]))  # Return empty array on error
     def shutdown(self) -> None:
+        """Mark Gemini as disconnected, reset the handler's events and notify the UI."""
+        global gemini_connected
+        gemini_connected = False # Reset on shutdown
+        logger.info("Shutting down GeminiHandler.")
+        # Wake anything blocked on quit.wait() (connect() waits on it).
         self.quit.set()
         self.connection = None
         self.args_set.clear()
+        # This branch is currently a no-op: the session is not closed here.
+        if self.session:
+             # No good async close method, this can get stuck.
+            #  asyncio.create_task(self.session.close())
+            pass
+        # NOTE(review): quit is cleared right after being set, so loops that poll
+        # quit.is_set() (see generator()) may never observe it — confirm intent.
         self.quit.clear()
+        gr.Info("Gemini connection closed.")
+# --- Gradio UI ---
 css = """
 #video-source {max-width: 600px !important; max-height: 600 !important;}
 """
+async def main():
+    """Build the Gradio UI, register its event handlers and launch the app."""
+    global twilio_available, gemini_connected
+    with gr.Blocks(css=css) as demo:
+        # NOTE(review): in the banner below, the second <a> tag is missing the
+        # opening quote of its href value.
+        gr.HTML(
+            """
+        <div style='display: flex; align-items: center; justify-content: center; gap: 20px'>
+            <div style="background-color: var(--block-background-fill); border-radius: 8px">
+                <img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
+            </div>
+            <div>
+                <h1>Gen AI SDK Voice Chat</h1>
+                <p>Speak with Gemini using real-time audio + video streaming</p>
+                <p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href=https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>⚡️</p>
+                <p>Get an API Key <a href="https://support.google.com/googleapi/answer/6158862?hl=en">here</a></p>
+            </div>
         </div>
+        """
         )
+        twilio_status_message = gr.Markdown("")  # For displaying Twilio status
+        # NOTE(review): nothing visible in this function ever updates gemini_status_message.
+        gemini_status_message = gr.Markdown("")  # For Gemini status
+        with gr.Row() as api_key_row:
+            api_key = gr.Textbox(
+                label="API Key",
+                type="password",
+                placeholder="Enter your API Key",
+                value=os.getenv("GOOGLE_API_KEY"),
+            )
+        # Hidden until check_api_key accepts a non-empty key.
+        with gr.Row(visible=False) as row:
+            with gr.Column():
+                # Starts with an empty ICE server list; update_twilio_status_and_ui
+                # returns an update for rtc_configuration on page load.
+                webrtc = WebRTC(
+                    label="Video Chat",
+                    modality="audio-video",
+                    mode="send-receive",
+                    elem_id="video-source",
+                    rtc_configuration={"iceServers": []},  # DUMMY CONFIGURATION
+                    icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
+                    pulse_color="rgb(35, 157, 225)",
+                    icon_button_color="rgb(35, 157, 225)",
+                )
+            with gr.Column():
+                image_input = gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
+        async def update_twilio_status_and_ui():
+            """Updates Twilio status and UI elements."""
+            await check_twilio_availability()  # Check Twilio availability
+            if twilio_available:
+                # Fetches credentials a second time; the ones obtained during the
+                # availability check are not reused.
+                rtc_config = get_twilio_turn_credentials()
+                message = "Twilio TURN server available.  Connection should be reliable."
+            else:
+                rtc_config = None
+                message = "**Warning:** Twilio TURN server unavailable.  Connection might be less reliable or fail if you are behind a symmetric NAT."
+            load_complete.set()  # Signal that load is complete - *before* returning
+            return gr.update(rtc_configuration=rtc_config), gr.update(value=message)
+        # Check Twilio availability and update UI on startup.
+        demo.load(update_twilio_status_and_ui, [], [webrtc, twilio_status_message])
+        async def start_streaming():
+            """Starts the WebRTC streaming after load_complete is set."""
+            await load_complete.wait()  # *Wait* for load to complete
+            await asyncio.sleep(0.1)     # Small delay (optional, but can help)
+            # NOTE(review): this registers the stream handler from inside a running
+            # event callback rather than while the Blocks layout is being built;
+            # presumably that is why the commit is marked "not working yet" — confirm.
+            webrtc.stream(
+                GeminiHandler(),
+                inputs=[webrtc, api_key, image_input],
+                outputs=[webrtc],
+                time_limit=90,
+                concurrency_limit=None,  # Removed concurrency limit
+            )
+        # Use .then() to chain start_streaming *after* demo.load
+        demo.load(None, [], []).then(start_streaming, [], [])
+        def check_api_key(api_key_str):
+            # Returns updates for, in order: api_key_row, row, twilio_status_message.
+            if not api_key_str:
+                return gr.update(visible=True), gr.update(visible=False), gr.update(value="Please enter a valid API key")
+            return gr.update(visible=False), gr.update(visible=True), gr.update(value="")
         api_key.submit(
+            check_api_key,
+            [api_key],
+            [api_key_row, row, twilio_status_message],
+        )
+    # NOTE(review): launch() is called from inside a coroutine driven by
+    # asyncio.run(main()) — confirm this is supported before relying on it.
     demo.launch()
+if __name__ == "__main__":
+    asyncio.run(main())

pyproject.toml ADDED Viewed

	@@ -0,0 +1,11 @@

+[project]
+name = "gemini-audio-video-chat"
+version = "0.1.0"
+description = "Real-time audio + video chat with Gemini over WebRTC, built with Gradio"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "gradio_webrtc==0.0.28",
+    "google-genai==0.3.0",
+    "twilio"
+]

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff