Spaces:

freddyaboulton
/

gemini-audio-video-chat

Running

App Files Files Community

skip_key_show_status

by ahundt - opened 5 days ago

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+1554

-56

Files changed (3) hide show

app.py +265 -56
pyproject.toml +12 -0
uv.lock +0 -0

app.py CHANGED Viewed

@@ -1,8 +1,13 @@
 import asyncio
 import base64
 import os
 import time
-from io import BytesIO
 import gradio as gr
 import numpy as np
@@ -15,23 +20,109 @@ from gradio_webrtc import (
     AudioEmitType,
     get_twilio_turn_credentials,
 )
-from PIL import Image
 def encode_audio(data: np.ndarray) -> dict:
-    """Encode Audio data to send to the server"""
-    return {"mime_type": "audio/pcm", "data": base64.b64encode(data.tobytes()).decode("UTF-8")}
-def encode_image(data: np.ndarray) -> dict:
-    with BytesIO() as output_bytes:
-        pil_image = Image.fromarray(data)
-        pil_image.save(output_bytes, "JPEG")
-        bytes_data = output_bytes.getvalue()
-    base64_str = str(base64.b64encode(bytes_data), "utf-8")
     return {"mime_type": "image/jpeg", "data": base64_str}
 class GeminiHandler(AsyncAudioVideoStreamHandler):
     def __init__(
         self, expected_layout="mono", output_sample_rate=24000, output_frame_size=480
@@ -54,71 +145,142 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
             output_sample_rate=self.output_sample_rate,
             output_frame_size=self.output_frame_size,
         )
     async def video_receive(self, frame: np.ndarray):
         if self.session:
-            # send image every 1 second
-            if time.time() - self.last_frame_time > 1:
-                self.last_frame_time = time.time()
-                await self.session.send(encode_image(frame))
-                if self.latest_args[2] is not None:
-                    await self.session.send(encode_image(self.latest_args[2]))
         self.video_queue.put_nowait(frame)
     async def video_emit(self) -> VideoEmitType:
-        return await self.video_queue.get()
     async def connect(self, api_key: str):
         if self.session is None:
-            client = genai.Client(api_key=api_key, http_options={"api_version": "v1alpha"})
-            config = {"response_modalities": ["AUDIO"]}
-            async with client.aio.live.connect(
-                model="gemini-2.0-flash-exp", config=config
-            ) as session:
-                self.session = session
-                asyncio.create_task(self.receive_audio())
-                await self.quit.wait()
     async def generator(self):
         while not self.quit.is_set():
-            turn = self.session.receive()
-            async for response in turn:
-                if data := response.data:
-                    yield data
     async def receive_audio(self):
-        async for audio_response in async_aggregate_bytes_to_16bit(
-            self.generator()
-        ):
-            self.audio_queue.put_nowait(audio_response)
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         _, array = frame
         array = array.squeeze()
-        audio_message = encode_audio(array)
-        if self.session:
-            await self.session.send(audio_message)
     async def emit(self) -> AudioEmitType:
         if not self.args_set.is_set():
             await self.wait_for_args()
         if self.session is None:
             asyncio.create_task(self.connect(self.latest_args[1]))
-        array = await self.audio_queue.get()
-        return (self.output_sample_rate, array)
     def shutdown(self) -> None:
-        self.quit.set()
         self.connection = None
         self.args_set.clear()
         self.quit.clear()
 css = """
 #video-source {max-width: 600px !important; max-height: 600 !important;}
 """
 with gr.Blocks(css=css) as demo:
     gr.HTML(
         """
@@ -135,16 +297,29 @@ with gr.Blocks(css=css) as demo:
     </div>
     """
     )
     with gr.Row() as api_key_row:
-        api_key = gr.Textbox(label="API Key", type="password", placeholder="Enter your API Key", value=os.getenv("GOOGLE_API_KEY"))
     with gr.Row(visible=False) as row:
         with gr.Column():
             webrtc = WebRTC(
                 label="Video Chat",
                 modality="audio-video",
                 mode="send-receive",
                 elem_id="video-source",
-                rtc_configuration=get_twilio_turn_credentials(),
                 icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
                 pulse_color="rgb(35, 157, 225)",
                 icon_button_color="rgb(35, 157, 225)",
@@ -152,19 +327,53 @@ with gr.Blocks(css=css) as demo:
         with gr.Column():
             image_input = gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
-        webrtc.stream(
-            GeminiHandler(),
-            inputs=[webrtc, api_key, image_input],
-            outputs=[webrtc],
-            time_limit=90,
-            concurrency_limit=2,
         )
-        api_key.submit(
-        lambda: (gr.update(visible=False), gr.update(visible=True)),
-        None,
-        [api_key_row, row],
     )
-if __name__ == "__main__":
-    demo.launch()

+# https://huggingface.co/spaces/freddyaboulton/gemini-audio-video-chat
+# related demos: https://github.com/freddyaboulton/gradio-webrtc
 import asyncio
 import base64
 import os
 import time
+import logging
+import traceback
+import cv2
 import gradio as gr
 import numpy as np
     AudioEmitType,
     get_twilio_turn_credentials,
 )
+import requests  # Use requests for synchronous Twilio check
+# --- Setup Logging ---
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# --- Global State ---
+twilio_available = None  # Will be set *before* Gradio initialization
+gemini_connected = False
+# --- Helper Functions ---
 def encode_audio(data: np.ndarray) -> dict:
+    """Wrap raw audio samples in the message dict sent to the Gemini session.
+
+    Args:
+        data: Audio samples; the array's raw bytes are Base64-encoded as-is.
+
+    Returns:
+        A dict with "mime_type" set to "audio/pcm" and "data" holding the
+        Base64 text of the array's bytes.
+
+    Raises:
+        TypeError: If ``data`` is not a NumPy array.
+    """
+    if not isinstance(data, np.ndarray):
+        raise TypeError("encode_audio expected a numpy.ndarray")
+    try:
+        raw_pcm = data.tobytes()
+        payload = base64.b64encode(raw_pcm).decode("UTF-8")
+    except Exception as e:
+        # Log for diagnosis, then let the caller decide how to react.
+        logger.error(f"Error encoding audio: {e}")
+        raise
+    return {"mime_type": "audio/pcm", "data": payload}
+def encode_image(data: np.ndarray, quality: int = 85) -> dict:
+    """
+    Encode an image array as a Base64 JPEG message using OpenCV.
+
+    ``uint8`` input is encoded unchanged. Any other floating-point or integer
+    dtype is min-max rescaled to 0-255 and converted to ``uint8`` first.
+
+    Args:
+        data: A NumPy array of shape (height, width, 3).
+        quality: JPEG quality (0-100).
+
+    Returns:
+        A dictionary with keys "mime_type" and "data".
+
+    Raises:
+        TypeError: If input is not a NumPy array or has an unsupported dtype.
+        ValueError: If input shape is incorrect or contains NaN/Inf.
+        Exception: If JPEG encoding fails.
+    """
+    # Input Validation (shape and dimensions)
+    if not isinstance(data, np.ndarray):
+        raise TypeError("Input must be a NumPy array.")
+    if data.ndim != 3 or data.shape[2] != 3:
+        raise ValueError("Input array must have shape (n, n, 3).")
+    if 0 in data.shape:
+        raise ValueError("Input array cannot have a dimension of size 0.")
+    is_float = np.issubdtype(data.dtype, np.floating)
+    if not is_float and not np.issubdtype(data.dtype, np.integer):
+        raise TypeError("Input array must have a floating-point or integer data type.")
+    # Only floating-point data can hold NaN/Inf, so integers skip the scan.
+    if is_float and not np.all(np.isfinite(data)):
+        raise ValueError("Input array contains NaN or Inf")
+    if data.dtype == np.uint8:
+        # Already 8-bit: min-max stretching would change the picture's
+        # contrast, so the pixels are encoded exactly as received.
+        scaled_data = np.ascontiguousarray(data)
+    else:
+        scaled_data = cv2.normalize(data, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
+    # NOTE(review): cv2.imencode treats 3-channel input as BGR — confirm the
+    # channel order of the frames and uploaded images passed in here.
+    # JPEG Encoding (with quality control and error handling)
+    try:
+        retval, buf = cv2.imencode(".jpg", scaled_data, [int(cv2.IMWRITE_JPEG_QUALITY), int(quality)])
+        if not retval:
+            raise Exception("cv2.imencode failed")
+    except Exception as e:
+        # Chain the cause so the original OpenCV traceback is not lost.
+        raise Exception(f"JPEG encoding failed: {e}") from e
+    # Base64 Encoding
+    jpeg_bytes = buf.tobytes()
+    base64_str = base64.b64encode(jpeg_bytes).decode('utf-8')
     return {"mime_type": "image/jpeg", "data": base64_str}
+def check_twilio_availability_sync() -> bool:
+    """Check whether Twilio TURN credentials can be obtained (synchronous).
+
+    Calls ``get_twilio_turn_credentials()`` up to three times, pausing between
+    attempts, and records the outcome in the module-level ``twilio_available``.
+
+    Returns:
+        True if a non-empty credentials object was returned, otherwise False.
+    """
+    global twilio_available
+    retries = 3
+    delay = 2
+    for attempt in range(retries):
+        try:
+            logger.info(f"Attempting to get Twilio credentials (attempt {attempt + 1})...")
+            credentials = get_twilio_turn_credentials()
+            if credentials:
+                # Deliberately not logging the response: it is expected to
+                # contain TURN usernames/credentials.
+                twilio_available = True
+                logger.info("Twilio TURN server available.")
+                return True
+            logger.warning(f"Attempt {attempt + 1}: empty Twilio credentials response.")
+        except requests.exceptions.RequestException as e:
+            # Network-level failure: log with traceback and fall through to retry.
+            logger.warning(f"Attempt {attempt + 1}: {e}", exc_info=True)
+        except Exception as e:
+            # Anything else is not expected to succeed on retry; give up now.
+            logger.exception(f"Unexpected error checking Twilio: {e}")
+            twilio_available = False
+            return False
+        # Back off before the next attempt, whether the call failed or came back empty.
+        if attempt < retries - 1:
+            time.sleep(delay)
+    twilio_available = False
+    logger.warning("Twilio TURN server unavailable.")
+    return False
+# --- Gemini Handler Class ---
 class GeminiHandler(AsyncAudioVideoStreamHandler):
     def __init__(
         self, expected_layout="mono", output_sample_rate=24000, output_frame_size=480
             output_sample_rate=self.output_sample_rate,
             output_frame_size=self.output_frame_size,
         )
     async def video_receive(self, frame: np.ndarray):
+        """Forward frames to Gemini at most once per second, then queue the frame.
+
+        When a session exists and more than one second has passed since the
+        last send, the frame is sent, followed by ``latest_args[2]`` if it is
+        not None. Send failures are logged and reported via ``gr.Warning``;
+        the frame is put on ``video_queue`` in every case.
+        """
         if self.session:
+            try:
+                elapsed = time.time() - self.last_frame_time
+                if elapsed > 1:
+                    self.last_frame_time = time.time()
+                    await self.session.send(encode_image(frame))
+                    extra_image = self.latest_args[2]
+                    if extra_image is not None:
+                        await self.session.send(encode_image(extra_image))
+            except Exception as e:
+                logger.error(f"Error sending video frame: {e}")
+                gr.Warning("Error sending video to Gemini.")
         self.video_queue.put_nowait(frame)
     async def video_emit(self) -> VideoEmitType:
+        """Return the next frame from ``video_queue``.
+
+        Returns:
+            The next queued frame, or None if reading the queue fails.
+
+        Raises:
+            asyncio.CancelledError: Re-raised so that cancelling the
+                surrounding task actually stops it.
+        """
+        try:
+            return await self.video_queue.get()
+        except asyncio.CancelledError:
+            # Swallowing cancellation would keep the task alive after it was
+            # asked to stop; log and propagate instead.
+            logger.info("Video emit cancelled.")
+            raise
+        except Exception as e:
+            logger.exception(f"Error in video_emit: {e}")
+            return None
     async def connect(self, api_key: str):
+        """Open a Gemini live session and hold it open until ``quit`` is set.
+
+        Does nothing when ``self.session`` is already set. On success the
+        session is stored on ``self.session``, ``receive_audio`` is started as
+        a background task, and this coroutine waits on ``self.quit``. On any
+        exception the handler is shut down and ``gr.Warning`` is called. The
+        status helper runs in every case.
+
+        Args:
+            api_key: API key passed to ``genai.Client``.
+        """
+        global gemini_connected
         if self.session is None:
+            try:
+                client = genai.Client(api_key=api_key, http_options={"api_version": "v1alpha"})
+                config = {"response_modalities": ["AUDIO"]}
+                async with client.aio.live.connect(
+                    model="gemini-2.0-flash-exp", config=config
+                ) as session:
+                    self.session = session
+                    gemini_connected = True
+                    # NOTE(review): the task reference is discarded; asyncio
+                    # recommends keeping one so the task cannot be collected early.
+                    asyncio.create_task(self.receive_audio())
+                    # Keep the ``async with`` (and so the session) open until shutdown.
+                    await self.quit.wait()
+                # NOTE(review): self.session is not reset to None after the
+                # context manager exits — confirm reconnecting is not expected.
+            except Exception as e:
+                logger.error(f"Error connecting to Gemini: {e}")
+                gemini_connected = False
+                self.shutdown()
+                gr.Warning(f"Failed to connect to Gemini: {e}")
+            finally:
+                update_gemini_status_sync()
     async def generator(self):
+        """Yield non-empty ``response.data`` payloads from the Gemini session.
+
+        Returns immediately (yielding nothing) when no session is set. Otherwise
+        loops over ``self.session.receive()`` turns until ``quit`` is set. Any
+        exception is logged, sets ``quit``, and ends the generator.
+        """
+        if not self.session:
+            logger.warning("Gemini session is not initialized.")
+            return
         while not self.quit.is_set():
+            try:
+                await asyncio.sleep(0)  # Yield to the event loop
+                # Re-check after yielding: quit may have been set while suspended.
+                if self.quit.is_set():
+                    break
+                turn = self.session.receive()
+                async for response in turn:
+                    if self.quit.is_set():
+                        break # Exit inner loop if quit is set.
+                    # Skip responses whose data is empty or missing.
+                    if data := response.data:
+                        yield data
+            except Exception as e:
+                logger.error(f"Error receiving from Gemini: {e}")
+                self.quit.set() # set quit if we error.
+                break
     async def receive_audio(self):
+        """Pump aggregated audio from ``generator()`` into ``audio_queue``.
+
+        Any exception raised while streaming is logged with its traceback and
+        ends the pump.
+        """
+        try:
+            audio_stream = async_aggregate_bytes_to_16bit(self.generator())
+            async for chunk in audio_stream:
+                self.audio_queue.put_nowait(chunk)
+        except Exception as e:
+            logger.exception(f"Error in receive_audio: {e}")
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        """Encode an incoming audio frame and send it to Gemini if connected.
+
+        Args:
+            frame: A two-item tuple; the second item is the sample array,
+                which is squeezed before encoding. The first item is ignored.
+        """
+        _rate, samples = frame
+        samples = samples.squeeze()
+        try:
+            message = encode_audio(samples)
+            if not self.session:
+                return
+            await self.session.send(message)
+        except Exception as e:
+            logger.error(f"Error sending audio: {e}")
+            gr.Warning("Error sending audio to Gemini.")
     async def emit(self) -> AudioEmitType:
+        """Return the next audio chunk received from Gemini.
+
+        Waits for the stream arguments if they are not set yet, and starts
+        ``connect`` in the background (using ``latest_args[1]`` as the API key)
+        when no session exists.
+
+        Returns:
+            ``(output_sample_rate, array)``; the array is empty if reading the
+            queue fails.
+
+        Raises:
+            asyncio.CancelledError: Re-raised so that cancelling the
+                surrounding task actually stops it.
+        """
         if not self.args_set.is_set():
             await self.wait_for_args()
         if self.session is None:
             asyncio.create_task(self.connect(self.latest_args[1]))
+        try:
+            array = await self.audio_queue.get()
+            return (self.output_sample_rate, array)
+        except asyncio.CancelledError:
+            # Swallowing cancellation would keep the task alive after it was
+            # asked to stop; log and propagate instead.
+            logger.info("Audio emit cancelled.")
+            raise
+        except Exception as e:
+            logger.exception(f"Error in emit: {e}")
+            return (self.output_sample_rate, np.array([]))
     def shutdown(self) -> None:
+        """Mark Gemini as disconnected and reset the handler's stream state.
+
+        Pulses ``quit`` (set, then clear) so coroutines already waiting on it
+        are released while the event is left unset for later use.
+        """
+        global gemini_connected
+        gemini_connected = False
+        logger.info("Shutting down GeminiHandler.")
+        # No explicit close here: the session is opened with ``async with`` in
+        # ``connect`` and that coroutine is released by the ``quit`` pulse below.
+        # NOTE(review): ``self.session`` is not reset here — confirm whether
+        # ``self.connection`` below was meant to be ``self.session``.
+        self.quit.set()
         self.connection = None
         self.args_set.clear()
         self.quit.clear()
+        update_gemini_status_sync()
+def update_gemini_status_sync():
+    """Build, log and return the Gemini connection status message.
+
+    The previous body tested ``'demo' in locals()``, which is always False
+    inside a function, and discarded the ``gr.update(...)`` it created, so it
+    never changed anything. A Gradio update only takes effect when an event
+    handler returns it, so this helper now reports the status through the
+    log and hands the text back for a handler to use.
+
+    Returns:
+        The status text derived from the module-level ``gemini_connected``.
+    """
+    status = "✅ Gemini: Connected" if gemini_connected else "❌ Gemini: Disconnected"
+    logger.info(status)
+    return status
+# --- Gradio UI ---
 css = """
 #video-source {max-width: 600px !important; max-height: 600 !important;}
 """
+# Perform Twilio check *before* Gradio UI definition (synchronously)
+if __name__ == "__main__":
+    check_twilio_availability_sync()
 with gr.Blocks(css=css) as demo:
     gr.HTML(
         """
     </div>
     """
     )
+    twilio_status_message = gr.Markdown("❓ Twilio: Checking...")
+    gemini_status_message = gr.Markdown("❓ Gemini: Checking...")
     with gr.Row() as api_key_row:
+        api_key = gr.Textbox(
+            label="API Key",
+            type="password",
+            placeholder="Enter your API Key",
+            value=os.getenv("GOOGLE_API_KEY"),
+        )
     with gr.Row(visible=False) as row:
         with gr.Column():
+            # Set rtc_configuration based on the *pre-checked* twilio_available
+            # (twilio_available is only populated when this file runs as __main__;
+            # otherwise it stays None and no TURN configuration is used).
+            # NOTE(review): this fetches credentials a second time — the availability
+            # check already called get_twilio_turn_credentials() — and this call is
+            # not wrapped in any error handling.
+            rtc_config = get_twilio_turn_credentials() if twilio_available else None
+            # Explicitly specify codecs (example - you might need to adjust)
+            # NOTE(review): 'codecs' does not appear to be a standard RTCConfiguration
+            # field — confirm the WebRTC component actually honors this key.
+            if rtc_config:
+                rtc_config['codecs'] = ['VP8', 'H264']  # Prefer VP8, then H.264
             webrtc = WebRTC(
                 label="Video Chat",
                 modality="audio-video",
                 mode="send-receive",
                 elem_id="video-source",
+                rtc_configuration=rtc_config,
                 icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
                 pulse_color="rgb(35, 157, 225)",
                 icon_button_color="rgb(35, 157, 225)",
         with gr.Column():
             image_input = gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
+    def update_twilio_status_ui():
+        """Return a Markdown update describing the pre-checked Twilio availability."""
+        message = (
+            "✅ Twilio: Available"
+            if twilio_available
+            else "❌ Twilio: Unavailable (connection may be less reliable)"
+        )
+        return gr.update(value=message)
+    demo.load(update_twilio_status_ui, [], [twilio_status_message])
+    handler = GeminiHandler()
+    webrtc.stream(
+        handler,
+        inputs=[webrtc, api_key, image_input],
+        outputs=[webrtc],
+        time_limit=90,
+        concurrency_limit=None,
+    )
+    def check_api_key(api_key_str):
+        """Reveal the chat UI once a non-empty API key has been submitted.
+
+        Returns four updates, in the order of the submit handler's outputs:
+        API-key row, chat row, Twilio status, Gemini status. The Twilio status
+        is left untouched (an empty ``gr.update()``): the missing-key prompt is
+        shown through ``gr.Warning`` rather than overwriting that status.
+
+        Args:
+            api_key_str: Current text of the API-key box.
+        """
+        if not api_key_str:
+            gr.Warning("Please enter a valid API key")
+            return (
+                gr.update(visible=True),
+                gr.update(visible=False),
+                gr.update(),
+                gr.update(value="❓ Gemini: Checking..."),
+            )
+        return (
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(),
+            gr.update(value="❓ Gemini: Checking..."),
         )
+    api_key.submit(
+        check_api_key,
+        [api_key],
+        [api_key_row, row, twilio_status_message, gemini_status_message],
     )
+    # If API key is already set via environment variables, hide the API key row and show content
+    if os.getenv("GOOGLE_API_KEY"):
+        demo.load(
+            lambda: (gr.update(visible=False), gr.update(visible=True)),
+            None,
+            [api_key_row, row],
+        )
+# Launch only when run as a script, so importing this module (e.g. by a
+# reloader or a test) does not start a server as a side effect.
+if __name__ == "__main__":
+    demo.launch()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,12 @@

+[project]
+name = "gemini-audio-video-chat"
+version = "0.1.0"
+description = "Gradio WebRTC demo for real-time audio and video chat with Gemini"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "gradio_webrtc==0.0.28",
+    "google-genai==0.3.0",
+    "twilio",
+    "opencv-python",
+    # app.py imports requests directly, so declare it instead of relying on a transitive install.
+    "requests",
+]

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff