"""Gradio app that sends a captured webcam frame plus a text instruction to a
chat-completions HTTP endpoint (``{base_url}/v1/chat/completions``) and shows
the text returned by the server.
"""

import asyncio
import base64
import io
import json
import threading
import time

import cv2
import gradio as gr
import numpy as np
import requests
from PIL import Image

# Request/encoding settings shared by every call to the completion endpoint.
_CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
_HEALTH_PATH = "/health"
_REQUEST_TIMEOUT_S = 10
_HEALTH_TIMEOUT_S = 5
_MAX_TOKENS = 100
_JPEG_QUALITY = 80


def _normalize_base_url(base_url):
    """Return *base_url* without surrounding whitespace or trailing slashes.

    ``None`` is treated as an empty string so callers never hit an
    ``AttributeError``. Normalizing avoids URLs such as
    ``http://host/ /v1/...`` or ``http://host//v1/...``.
    """
    return (base_url or "").strip().rstrip("/")


class CameraProcessor:
    """Encode camera frames and submit them to a chat-completions endpoint."""

    def __init__(self):
        # Processing flag flipped by toggle_processing().
        self.is_processing = False
        # Reserved for a background worker; not started anywhere in this file.
        self.processing_thread = None
        self.stop_event = threading.Event()

    def encode_image_to_base64(self, image):
        """Convert a numpy image to a JPEG ``data:`` URL.

        Args:
            image: Image array in RGB channel order (converted to BGR before
                encoding), or ``None``.

        Returns:
            A ``data:image/jpeg;base64,...`` string, or ``None`` when *image*
            is ``None`` or JPEG encoding reports failure.
        """
        if image is None:
            return None

        # OpenCV encoders expect BGR channel order.
        image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        encoded_ok, buffer = cv2.imencode(
            ".jpg", image_bgr, [cv2.IMWRITE_JPEG_QUALITY, _JPEG_QUALITY]
        )
        if not encoded_ok:
            # Callers treat a falsy result as "Failed to encode image".
            return None

        image_base64 = base64.b64encode(buffer).decode("utf-8")
        return f"data:image/jpeg;base64,{image_base64}"

    def _post_chat_completion(
        self, instruction, image_base64_url, base_url, *, verbose=False
    ):
        """POST one instruction + image to the endpoint and return the reply.

        This is the single blocking implementation used by both the sync and
        async entry points.

        Args:
            instruction: Text prompt sent as the ``text`` content part.
            image_base64_url: ``data:`` URL sent as the ``image_url`` part.
            base_url: Server base URL; whitespace and trailing slashes are
                stripped before the path is appended.
            verbose: When true, print ``DEBUG:`` progress lines.

        Returns:
            The message content from the first choice on success, otherwise a
            human-readable string starting with ``Server error:`` or
            ``Error:``. Never raises for request or parsing failures.
        """

        def debug(message):
            if verbose:
                print(f"DEBUG: {message}")

        try:
            payload = {
                "max_tokens": _MAX_TOKENS,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": instruction},
                            {
                                "type": "image_url",
                                "image_url": {"url": image_base64_url},
                            },
                        ],
                    }
                ],
            }

            debug("Making HTTP request...")
            response = requests.post(
                f"{_normalize_base_url(base_url)}{_CHAT_COMPLETIONS_PATH}",
                headers={"Content-Type": "application/json"},
                json=payload,
                timeout=_REQUEST_TIMEOUT_S,
            )
            debug(f"Response status: {response.status_code}")

            if not response.ok:
                error_msg = f"Server error: {response.status_code} - {response.text}"
                debug(error_msg)
                return error_msg

            data = response.json()
            try:
                result = data["choices"][0]["message"]["content"]
            except (KeyError, IndexError, TypeError) as e:
                # A bare str(KeyError) would read "Error: 'choices'", which
                # tells the user nothing; say what actually went wrong.
                error_msg = f"Error: unexpected response format ({e!r})"
                debug(f"Exception occurred: {error_msg}")
                return error_msg

            debug(f"Success - got response: {result}")
            return result
        except Exception as e:
            # UI boundary: surface any failure as text instead of crashing.
            error_msg = f"Error: {str(e)}"
            debug(f"Exception occurred: {error_msg}")
            return error_msg

    async def send_chat_completion_request(
        self, instruction, image_base64_url, base_url
    ):
        """Send request to chat completion API without blocking the event loop.

        The blocking ``requests`` call runs in the loop's default executor.

        Returns:
            The reply text, or an error string (see ``_post_chat_completion``).
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            self._post_chat_completion,
            instruction,
            image_base64_url,
            base_url,
        )

    def process_frame(self, instruction, image, base_url):
        """Process a single frame: encode it, send it, return the reply text.

        Args:
            instruction: Text prompt to send with the frame.
            image: RGB numpy image, or ``None``.
            base_url: Server base URL.

        Returns:
            The server's reply, or a descriptive error/status string.
        """
        print(f"DEBUG: process_frame called with base_url: {base_url}")

        if image is None:
            print("DEBUG: No image captured")
            return "No image captured"

        image_base64 = self.encode_image_to_base64(image)
        if not image_base64:
            print("DEBUG: Failed to encode image")
            return "Failed to encode image"

        print(
            "DEBUG: Sending request to "
            f"{_normalize_base_url(base_url)}{_CHAT_COMPLETIONS_PATH}"
        )
        return self._post_chat_completion(
            instruction, image_base64, base_url, verbose=True
        )


# Initialize processor
processor = CameraProcessor()


def process_image(instruction, image, base_url):
    """Main processing function for Gradio interface.

    Validates the inputs and delegates to ``processor.process_frame``.

    Returns:
        The server's reply, or a message telling the user what is missing.
    """
    print(f"DEBUG: process_image called - is_processing: {processor.is_processing}")
    print(f"DEBUG: instruction: '{instruction}'")
    print(f"DEBUG: base_url: '{base_url}'")
    print(f"DEBUG: image is None: {image is None}")
    print(f"DEBUG: image type: {type(image)}")

    # Always return something so it is visible that the function was called.
    if image is None:
        print("DEBUG: No image from webcam")
        return "No image from webcam - check camera permissions or try a different browser"

    # NOTE: processor.is_processing is deliberately not checked here so the
    # "Process Image" button works for manual testing without pressing Start.

    # `or ""` guards against a None value, which has no .strip().
    if not (instruction or "").strip():
        print("DEBUG: No instruction provided")
        return "Please enter an instruction"

    if not (base_url or "").strip():
        print("DEBUG: No base URL provided")
        return "Please enter a base URL"

    print("DEBUG: Calling process_frame")
    result = processor.process_frame(instruction, image, base_url)
    print(f"DEBUG: process_frame result: {result}")
    return result


def toggle_processing():
    """Toggle processing state.

    Returns:
        A ``(button_label, status_message)`` tuple for the new state.
    """
    processor.is_processing = not processor.is_processing
    print(f"DEBUG: Processing toggled to: {processor.is_processing}")

    if processor.is_processing:
        return "Stop", "Processing started..."
    return "Start", "Processing stopped."


def update_stream_interval(interval):
    """Return a Gradio update that sets ``stream_every`` to *interval*."""
    return gr.update(stream_every=interval)


def test_api_connection(base_url):
    """Test if API server is reachable by requesting ``{base_url}/health``.

    Returns:
        ``"API accessible: <status>"`` when a response is received (whatever
        its status code), otherwise ``"API connection failed: <reason>"``.
    """
    try:
        response = requests.get(
            f"{_normalize_base_url(base_url)}{_HEALTH_PATH}",
            timeout=_HEALTH_TIMEOUT_S,
        )
        return f"API accessible: {response.status_code}"
    except Exception as e:
        return f"API connection failed: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Camera Interaction App", theme=gr.themes.Soft()) as interface:
    gr.Markdown("# Camera Interaction App")
    gr.Markdown("**Note:** Make sure to grant camera permissions in your browser!")

    with gr.Row():
        # Video input
        video_input = gr.Image(
            sources=["webcam"],
            label="Camera Feed - Click to capture",
            width=480,
            height=360,
        )

        with gr.Column():
            # Base URL input
            base_url_input = gr.Textbox(
                label="Base API URL",
                value="http://localhost:8080",
                placeholder="Enter API base URL",
            )

            # Instruction input
            instruction_input = gr.Textbox(
                label="Instruction",
                value="What do you see?",
                placeholder="Enter your instruction",
                lines=2,
            )

            # Response output
            response_output = gr.Textbox(
                label="Response",
                value="1. Grant camera permissions\n2. Capture a photo\n3. Click Process Image",
                interactive=False,
                lines=3,
            )

    with gr.Row():
        # Manual process button for testing
        process_button = gr.Button("Process Image", variant="primary")
        # Test button
        test_button = gr.Button("Test API Connection", variant="secondary")

    with gr.Row():
        test_output = gr.Textbox(label="Connection Test", interactive=False)

    # Manual processing for testing
    process_button.click(
        fn=process_image,
        inputs=[instruction_input, video_input, base_url_input],
        outputs=response_output,
    )

    test_button.click(
        fn=test_api_connection,
        inputs=base_url_input,
        outputs=test_output,
    )


if __name__ == "__main__":
    interface.launch(
        server_name="localhost",
        server_port=7860,
        share=False,
        debug=True,
    )