import gradio as gr
import torch
from transformers import AutoModelForCausalLM
from PIL import Image
import numpy as np
from io import BytesIO
import spaces

# Initialize model globally
model = None


def load_model():
    # Lazily load and compile the model on first use; the global keeps it cached across calls
    global model
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(
            "moondream/moondream3-preview",
            trust_remote_code=True,
            dtype=torch.bfloat16,
            device_map={"": "cuda"},
        )
        model.compile()
    return model


@spaces.GPU(duration=120)
def process_image(image, task, question, caption_length, object_query, reasoning, temperature, top_p, max_tokens):
    model = load_model()

    settings = {
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens
    }

    if task == "Query":
        if image is not None:
            result = model.query(
                image=Image.fromarray(image),
                question=question,
                reasoning=reasoning,
                settings=settings
            )
            return result["answer"], None, None
        else:
            # Text-only query (no image provided)
            result = model.query(
                question=question,
                reasoning=reasoning,
                settings=settings
            )
            return result["answer"], None, None

    elif task == "Caption":
        if image is None:
            return "Please upload an image for captioning", None, None
        result = model.caption(
            Image.fromarray(image),
            length=caption_length.lower(),
            settings=settings
        )
        return result["caption"], None, None

    elif task == "Point":
        if image is None:
            return "Please upload an image for point detection", None, None
        result = model.point(Image.fromarray(image), object_query)

        # Visualize points on the image (coordinates are normalized to [0, 1])
        img_with_points = image.copy()
        h, w = img_with_points.shape[:2]

        points_text = "Points found:\n"
        for i, point in enumerate(result.get("points", [])):
            x = int(point['x'] * w)
            y = int(point['y'] * h)

            # Draw a red circle at each point
            try:
                import cv2
                cv2.circle(img_with_points, (x, y), 10, (255, 0, 0), -1)
            except ImportError:
                # Fallback to plain numpy indexing if cv2 is not available
                for dx in range(-5, 6):
                    for dy in range(-5, 6):
                        if dx*dx + dy*dy <= 25:  # Circle with radius 5
                            px, py = x + dx, y + dy
                            if 0 <= px < w and 0 <= py < h:
                                img_with_points[py, px] = [255, 0, 0]

            points_text += f"Point {i+1}: x={point['x']:.3f}, y={point['y']:.3f}\n"

        return points_text, img_with_points, None

    elif task == "Detect":
        if image is None:
            return "Please upload an image for object detection", None, None
        detect_settings = settings.copy()
        detect_settings["max_objects"] = 10
        result = model.detect(Image.fromarray(image), object_query, settings=detect_settings)

        # Visualize bounding boxes (coordinates are normalized to [0, 1])
        img_with_boxes = image.copy()
        h, w = img_with_boxes.shape[:2]

        boxes_text = "Objects detected:\n"
        for i, obj in enumerate(result.get("objects", [])):
            x_min = int(obj['x_min'] * w)
            y_min = int(obj['y_min'] * h)
            x_max = int(obj['x_max'] * w)
            y_max = int(obj['y_max'] * h)

            # Draw bounding box
            thickness = 3
            # Top and bottom borders
            img_with_boxes[y_min:y_min+thickness, x_min:x_max] = [0, 255, 0]
            img_with_boxes[y_max-thickness:y_max, x_min:x_max] = [0, 255, 0]
            # Left and right borders
            img_with_boxes[y_min:y_max, x_min:x_min+thickness] = [0, 255, 0]
            img_with_boxes[y_min:y_max, x_max-thickness:x_max] = [0, 255, 0]

            boxes_text += f"Object {i+1}: x_min={obj['x_min']:.3f}, y_min={obj['y_min']:.3f}, x_max={obj['x_max']:.3f}, y_max={obj['y_max']:.3f}\n"

        return boxes_text, None, img_with_boxes


with gr.Blocks(title="Moondream 3 Preview", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌙 Moondream 3 Preview - Vision Language Model

        Experience the power of Moondream 3, a state-of-the-art vision language model with mixture-of-experts architecture.
        This demo showcases all four skills: Query, Caption, Point, and Detect.
        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image (optional for Query)", type="numpy")

            task_type = gr.Radio(
                choices=["Query", "Caption", "Point", "Detect"],
                value="Query",
                label="Select Task"
            )

            with gr.Column(visible=True) as query_options:
                question_input = gr.Textbox(
                    label="Question",
                    placeholder="Ask anything about the image or enter a text-only question",
                    lines=2
                )
                reasoning_toggle = gr.Checkbox(
                    label="Enable Reasoning (better for complex questions)",
                    value=True
                )

            with gr.Column(visible=False) as caption_options:
                caption_length = gr.Radio(
                    choices=["Short", "Normal", "Long"],
                    value="Normal",
                    label="Caption Length"
                )

            with gr.Column(visible=False) as point_detect_options:
                object_query_input = gr.Textbox(
                    label="Object to Find",
                    placeholder="e.g., 'person wearing red shirt', 'car', 'dog'",
                    lines=1
                )

            gr.Markdown("### Advanced Settings")
            with gr.Accordion("Generation Parameters", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature"
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top-p"
                )
                max_tokens = gr.Slider(
                    minimum=50,
                    maximum=2048,
                    value=512,
                    step=50,
                    label="Max Tokens"
                )

            submit_btn = gr.Button("🚀 Process", variant="primary")

        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Output",
                lines=10,
                show_copy_button=True
            )
            output_image_points = gr.Image(
                label="Visualization (Points)",
                visible=False
            )
            output_image_boxes = gr.Image(
                label="Visualization (Bounding Boxes)",
                visible=False
            )

    def update_interface(task):
        return {
            query_options: gr.Column(visible=(task == "Query")),
            caption_options: gr.Column(visible=(task == "Caption")),
            point_detect_options: gr.Column(visible=(task in ["Point", "Detect"])),
            output_image_points: gr.Image(visible=False),
            output_image_boxes: gr.Image(visible=False)
        }

    def process_and_update_visibility(image, task, question, caption_length, object_query, reasoning, temperature, top_p, max_tokens):
        text_output, points_img, boxes_img = process_image(
            image, task, question, caption_length, object_query,
            reasoning, temperature, top_p, max_tokens
        )
        return {
            output_text: text_output,
            output_image_points: gr.Image(value=points_img, visible=(points_img is not None)),
            output_image_boxes: gr.Image(value=boxes_img, visible=(boxes_img is not None))
        }

    task_type.change(
        update_interface,
        inputs=[task_type],
        outputs=[query_options, caption_options, point_detect_options, output_image_points, output_image_boxes]
    )

    submit_btn.click(
        process_and_update_visibility,
        inputs=[
            input_image, task_type, question_input, caption_length,
            object_query_input, reasoning_toggle, temperature, top_p, max_tokens
        ],
        outputs=[output_text, output_image_points, output_image_boxes]
    )

    gr.Examples(
        examples=[
            [None, "Query", "Explain the concept of neural networks", "Normal", "", True, 0.7, 0.95, 512],
            [None, "Query", "What is the capital of France?", "Normal", "", False, 0.3, 0.95, 256],
        ],
        inputs=[
            input_image, task_type, question_input, caption_length,
            object_query_input, reasoning_toggle, temperature, top_p, max_tokens
        ],
        label="Example Queries"
    )

    gr.Markdown(
        """
        ### About Moondream 3
        - **Architecture**: 9B total parameters, 2B active, with mixture-of-experts
        - **Skills**: Query (Q&A), Caption, Point detection, Object detection
        - **Features**: 32K context length, multi-crop high resolution processing
        - **Model**: [moondream/moondream3-preview](https://huggingface.co/moondream/moondream3-preview)

        ### Tips:
        - **Query**: Ask open-ended questions about images or use for text-only tasks
        - **Caption**: Generate short, normal, or long descriptions of images
        - **Point**: Find specific objects and get their coordinates
        - **Detect**: Get bounding boxes for objects in images
        - Enable reasoning for complex visual understanding tasks
        """
    )

demo.launch()