pszemraj committed
Commit 629aa9f · verified · 1 Parent(s): f260034

Update app.py

Files changed (1): app.py (+137 -209)
app.py CHANGED
@@ -7,296 +7,224 @@ import numpy as np
Old version (lines 7–302; removed lines prefixed "-"):

  import spaces
  import torch
  import torch.nn.functional as F
- from PIL import Image, ImageDraw, ImageOps
  from transformers import AutoImageProcessor, AutoModel

- # --- Configuration ---
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  MODEL_MAP = {
-     "DINOv3 ViT-L/16 Satellite": "facebook/dinov3-vitl16-pretrain-sat493m",
-     "DINOv3 ViT-L/16 LVD (General Web)": "facebook/dinov3-vitl16-pretrain-lvd1689m",
-     # "⚠️ DINOv3 ViT-7B/16 Satellite": "facebook/dinov3-vit7b16-pretrain-sat493m",  # Uncomment if using a large enough GPU
  }

- DEFAULT_MODEL_NAME = list(MODEL_MAP.keys())[0]

- # --- Global State ---
  processor = None
  model = None


- # --- Core Functions ---
  def cleanup_memory():
-     """Aggressively cleans up memory to prevent OOM errors when switching models."""
      global processor, model
      if model is not None:
          del model
      if processor is not None:
          del processor
-     processor, model = None, None
      gc.collect()
      if torch.cuda.is_available():
          torch.cuda.empty_cache()


- def load_model(name: str):
-     """Loads a specified model and processor, handling memory cleanup."""
      global processor, model
      try:
          cleanup_memory()
          model_id = MODEL_MAP[name]

          processor = AutoImageProcessor.from_pretrained(model_id)
          model = (
-             AutoModel.from_pretrained(model_id, torch_dtype="auto").to(DEVICE).eval()
          )

          param_count = sum(p.numel() for p in model.parameters()) / 1e9
-         dtype_str = str(next(model.parameters()).dtype).split(".")[-1]

-         return f"Loaded: {name} | {param_count:.1f}B params | {dtype_str} | {DEVICE.upper()}"
      except Exception as e:
          cleanup_memory()
-         return f"Failed to load {name}: {str(e)}"
-

- @spaces.GPU(duration=60)
- def _extract_grid(img: Image.Image):
-     """Extracts a grid of feature vectors from an image."""
-     with torch.inference_mode():
-         inputs = processor(images=img, return_tensors="pt").to(DEVICE)
-         outputs = model(**inputs)

-     last_hidden_state = outputs.last_hidden_state[0].to(torch.float32)

-     # Correctly calculate grid dimensions from model config
-     p = model.config.patch_size
-     h, w = inputs.pixel_values.shape[-2] // p, inputs.pixel_values.shape[-1] // p

-     # Exclude [CLS] token and reshape
-     # DINOv2/v3 models don't have register tokens, but this is safe
-     num_special_tokens = 1
-     features = last_hidden_state[num_special_tokens:, :].reshape(h, w, -1).cpu()

-     return features, h, w


- def _overlay_heatmap(
-     base_img: Image.Image,
-     heatmap_01: np.ndarray,
-     opacity: float,
-     colormap: str,
-     box: tuple = None,
- ):
-     """Overlays a heatmap on the base image with a specified colormap and opacity."""
-     h, w = base_img.height, base_img.width

-     # Resize heatmap and apply colormap
-     heatmap = Image.fromarray((heatmap_01 * 255).astype(np.uint8)).resize(
-         (w, h), resample=Image.LANCZOS
-     )
-     cmap_func = cm.get_cmap(colormap.lower())
-     rgba_heatmap = (cmap_func(np.asarray(heatmap) / 255.0) * 255).astype(np.uint8)

-     # Create overlay image with specified opacity
-     overlay = Image.fromarray(rgba_heatmap, "RGBA")
-     overlay.putalpha(int(opacity * 255))

-     # Composite overlay onto a copy of the base image
-     out_img = Image.alpha_composite(base_img.copy().convert("RGBA"), overlay)

-     # Draw selection box if provided
      if box:
-         draw = ImageDraw.Draw(out_img, "RGBA")
-         # Draw a thick white border with a thin black outline for visibility
-         draw.rectangle(box, outline=(255, 255, 255, 255), width=3)
-         draw.rectangle(
-             (box[0] - 1, box[1] - 1, box[2] + 1, box[3] + 1),
-             outline=(0, 0, 0, 200),
-             width=1,
-         )

-     return out_img.convert("RGB")


- # --- Gradio Event Handlers ---
- def prepare(img: Image.Image):
-     """Prepares the image by extracting features and storing them in the state."""
      if img is None:
-         return None, "Upload an image to begin."

-     base_img = ImageOps.exif_transpose(img.convert("RGB"))
-     feats, gh, gw = _extract_grid(base_img)

-     state = {"orig": base_img, "feats": feats, "gh": gh, "gw": gw}
-     return state, "Click on the image to compute similarity."


- def on_click(state: dict, opacity: float, colormap: str, evt: gr.SelectData):
-     """Handles click events to compute and display the similarity heatmap."""
      if not state or evt.index is None:
-         return gr.UNCHANGED, state, "Please upload an image first."

      base, feats, gh, gw = state["orig"], state["feats"], state["gh"], state["gw"]

-     # Calculate patch index from click coordinates
      x, y = evt.index
-     px_w, px_h = base.width / gw, base.height / gh
-     i = min(int(x // px_w), gw - 1)
-     j = min(int(y // px_h), gh - 1)

-     # Compute similarity
      d = feats.shape[-1]
-     query_vec = F.normalize(feats[j, i].reshape(1, d), dim=1)
-     feature_grid = F.normalize(feats.reshape(-1, d), dim=1)
-     sims = (feature_grid @ query_vec.T).reshape(gh, gw).numpy()

-     # Normalize similarity map to [0, 1] for visualization
      smin, smax = float(sims.min()), float(sims.max())
-     heatmap_01 = (sims - smin) / (smax - smin + 1e-12)
-
-     # Define selection box coordinates
-     box = (int(i * px_w), int(j * px_h), int((i + 1) * px_w), int((j + 1) * px_h))
-
-     # Generate overlay image
-     output_img = _overlay_heatmap(base, heatmap_01, opacity, colormap, box)
-
-     # Create statistics string
-     stats = f"""📊 **Similarity Statistics**
- - **Min**: `{smin:.3f}`
- - **Max**: `{smax:.3f}`
- - **Range**: `{smax - smin:.3f}`
- - **Patch Index**: `({i}, {j})`
- - **Grid Size**: `{gw}×{gh}`"""
-
-     return output_img, state, stats
-
-
- def reset_overlay(state: dict):
-     """Resets the image to its original state, removing the heatmap."""
-     if state and "orig" in state:
-         return state["orig"], state, "Overlay reset. Click the image again."
-     return None, None, "Upload an image to begin."
-
-
- # --- Gradio UI ---
- with gr.Blocks(
-     theme=gr.themes.Soft(primary_hue="blue"),
-     css=".container {max-width: 1200px; margin: auto;}",
- ) as demo:
-     gr.HTML(
-         """
-         <div style="text-align: center; padding: 20px;">
-             <h1>🛰️ DINOv3 Satellite Vision: Interactive Patch Similarity</h1>
-             <p style="font-size: 1.1em; color: #666;">Explore how DINOv3 models trained on satellite imagery understand visual patterns</p>
-         </div>
-         """
-     )

-     with gr.Row():
-         with gr.Column(scale=3):
-             gr.Markdown(
-                 """
-                 ### How it works
-                 1. **Select a model** - Satellite-pretrained models are optimized for aerial/satellite imagery.
-                 2. **Upload or select an image** - Works best with satellite, aerial, or outdoor scenes.
-                 3. **Click any region** - See how similar other patches are to your selection.
-                 4. **Adjust visualization** - Fine-tune opacity and colormap for clarity.
-                 """
-             )
-         with gr.Column(scale=2):
-             gr.HTML(
-                 """
-                 <div style="background: rgba(0,0,0,0.03); border-radius: 8px; padding: 12px; border-left: 4px solid #2563eb;">
-                     <b>💡 Model Info:</b><br>
-                     • <b>Satellite models</b>: Trained on 493M satellite images.<br>
-                     • <b>LVD model</b>: Trained on 1.7B diverse images.<br>
-                     • <b>7B model</b>: Massive capacity, slower but more nuanced.
-                 </div>
-                 """
-             )

-     with gr.Row(variant="panel"):
-         with gr.Column(scale=1):
              model_choice = gr.Dropdown(
-                 choices=list(MODEL_MAP.keys()),
-                 value=DEFAULT_MODEL_NAME,
-                 label="🤖 Model Selection",
              )
              status = gr.Textbox(
-                 label="📡 Model Status",
-                 value="Loading initial model...",
-                 interactive=False,
              )

-             with gr.Accordion("Visualization Controls", open=True):
-                 opacity = gr.Slider(
-                     0.2, 0.9, 0.55, step=0.05, label="🎨 Heatmap Opacity"
-                 )
-                 colormap = gr.Dropdown(
-                     ["Turbo", "Inferno", "Viridis", "Plasma", "Jet"],
-                     value="Turbo",
-                     label="🌈 Colormap",
-                 )
-
-             info_panel = gr.Markdown(
-                 value="*Upload an image and click on it to see statistics here.*",
-                 label="Statistics",
-             )
-
-             with gr.Row():
-                 reset_btn = gr.Button("🔄 Reset Overlay")
-                 # A ClearButton is simpler for clearing multiple components
-                 clear_btn = gr.ClearButton(value="🗑️ Clear All")
-
-         with gr.Column(scale=2):
              img = gr.Image(
-                 type="pil",
-                 label="Interactive Canvas (Click to explore)",
-                 height=600,
-                 show_download_button=True,
              )

-     # Define a state object to hold persistent data (original image, features)
      state = gr.State()

-     # NOTE: For Hugging Face Spaces, list file paths explicitly.
-     # Make sure these images are in your repository.
-     gr.Examples(
-         examples=[
-             ["examples/satellite_city.jpg"],
-             ["examples/coastal_area.png"],
-             ["examples/farmland.webp"],
-         ],
-         inputs=[img],
-         outputs=[state, info_panel],
-         fn=prepare,
-         cache_examples=torch.cuda.is_available(),  # Cache on GPU instances
-     )

-     # --- Event Wiring ---
-     demo.load(lambda: load_model(DEFAULT_MODEL_NAME), outputs=status)
-
-     model_choice.change(
-         load_model, inputs=model_choice, outputs=status, show_progress="full"
-     )
-
-     img.upload(
-         prepare, inputs=img, outputs=[state, info_panel], show_progress="minimal"
-     )

      img.select(
-         on_click,
-         inputs=[state, opacity, colormap],  # REMOVED `img` from inputs to fix the error
-         outputs=[img, state, info_panel],
          show_progress="minimal",
      )

-     reset_btn.click(reset_overlay, inputs=[state], outputs=[img, state, info_panel])
-
-     # Wire the clear button to the components it should clear
-     clear_btn.add([img, state, info_panel])

- print(load_model(DEFAULT_MODEL_NAME))
- demo.launch(share=False, show_error=True)
 
New version (lines 7–230; added lines prefixed "+"):

  import spaces
  import torch
  import torch.nn.functional as F
+ from PIL import Image, ImageOps
  from transformers import AutoImageProcessor, AutoModel

+ # Device configuration with memory management
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  MODEL_MAP = {
+     "DINOv3 ViT-L/16 Satellite (493M)": "facebook/dinov3-vitl16-pretrain-sat493m",
+     "DINOv3 ViT-L/16 LVD (1.7B)": "facebook/dinov3-vitl16-pretrain-lvd1689m",
+     "⚠️ DINOv3 ViT-7B/16 Satellite": "facebook/dinov3-vit7b16-pretrain-sat493m",
  }

+ DEFAULT_NAME = list(MODEL_MAP.keys())[0]

+ # Global model state
  processor = None
  model = None


  def cleanup_memory():
+     """Aggressive memory cleanup for model switching"""
      global processor, model
+
      if model is not None:
          del model
+         model = None
+
      if processor is not None:
          del processor
+         processor = None
+
      gc.collect()
+
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
+         torch.cuda.synchronize()


+ def load_model(name):
+     """Load model with proper memory management and dtype handling"""
      global processor, model
+
      try:
+         # Clean up existing model
          cleanup_memory()
+
          model_id = MODEL_MAP[name]

+         # Load processor
          processor = AutoImageProcessor.from_pretrained(model_id)
+
+         # Load model with auto dtype for optimal performance
          model = (
+             AutoModel.from_pretrained(
+                 model_id,
+                 torch_dtype="auto",
+             )
+             .to(DEVICE)
+             .eval()
          )

+         # Get model info
          param_count = sum(p.numel() for p in model.parameters()) / 1e9

+         return f"Loaded: {name} | {param_count:.1f}B params | {DEVICE.upper()}"
+
      except Exception as e:
          cleanup_memory()
+         return f"Failed to load {name}: {str(e)}"


+ # Initialize default model
+ load_model(DEFAULT_NAME)


+ @spaces.GPU(duration=60)
+ def _extract_grid(img):
+     """Extract feature grid from image"""
+     with torch.inference_mode():
+         pv = processor(images=img, return_tensors="pt").pixel_values

+         if DEVICE == "cuda":
+             pv = pv.to(DEVICE)

+         out = model(pixel_values=pv)
+         last = out.last_hidden_state[0].to(torch.float32)

+         num_reg = getattr(model.config, "num_register_tokens", 0)
+         p = model.config.patch_size
+         _, _, Ht, Wt = pv.shape
+         gh, gw = Ht // p, Wt // p

+         feats = last[1 + num_reg :, :].reshape(gh, gw, -1).cpu()

+     return feats, gh, gw


+ def _overlay(orig, heat01, alpha=0.55, box=None):
+     """Create heatmap overlay"""
+     H, W = orig.height, orig.width
+     heat = Image.fromarray((heat01 * 255).astype(np.uint8)).resize((W, H))
+     rgba = (cm.get_cmap("inferno")(np.asarray(heat) / 255.0) * 255).astype(np.uint8)
+     ov = Image.fromarray(rgba, "RGBA")
+     ov.putalpha(int(alpha * 255))
+     base = orig.copy().convert("RGBA")
+     out = Image.alpha_composite(base, ov)
      if box:
+         from PIL import ImageDraw

+         ImageDraw.Draw(out, "RGBA").rectangle(
+             box, outline=(255, 255, 255, 220), width=2
+         )
+     return out


+ def prepare(img):
+     """Prepare image and extract features"""
      if img is None:
+         return None
+
+     base = ImageOps.exif_transpose(img.convert("RGB"))
+     feats, gh, gw = _extract_grid(base)

+     return {"orig": base, "feats": feats, "gh": gh, "gw": gw}


+ def click(state, opacity, img_value, evt: gr.SelectData):
+     """Handle click events for similarity visualization"""
+     # If state wasn't prepared (e.g., Example selection), build it now
+     if state is None and img_value is not None:
+         state = prepare(img_value)

      if not state or evt.index is None:
+         # Just show whatever is currently in the image component
+         return img_value, state

      base, feats, gh, gw = state["orig"], state["feats"], state["gh"], state["gw"]

      x, y = evt.index
+     px_x, px_y = base.width / gw, base.height / gh
+     i = min(int(x // px_x), gw - 1)
+     j = min(int(y // px_y), gh - 1)

      d = feats.shape[-1]
+     grid = F.normalize(feats.reshape(-1, d), dim=1)
+     v = F.normalize(feats[j, i].reshape(1, d), dim=1)
+     sims = (grid @ v.T).reshape(gh, gw).numpy()

      smin, smax = float(sims.min()), float(sims.max())
+     heat01 = (sims - smin) / (smax - smin + 1e-12)

+     box = (int(i * px_x), int(j * px_y), int((i + 1) * px_x), int((j + 1) * px_y))
+     overlay = _overlay(base, heat01, alpha=opacity, box=box)
+     return overlay, state
+
+
+ def reset():
+     """Reset the interface"""
+     return None, None

+
+ with gr.Blocks() as demo:
+     gr.Markdown("## DINOv3: patch similarity visualizer")
+     gr.Markdown(
+         "Upload an image, click on an object, and see which patches DINOv3 considers most similar, revealing how DINOv3 natively segments objects through its features."
+     )
+     gr.Markdown("There are multiple model options to pick from in the dropdown.")
+     gr.Markdown(
+         "Please click Reset before uploading another image, as this app keeps the features of the current image."
+     )
+
+     with gr.Column():
+         with gr.Row(scale=1):
              model_choice = gr.Dropdown(
+                 choices=list(MODEL_MAP.keys()), value=DEFAULT_NAME, label="Model"
              )
              status = gr.Textbox(
+                 label="Status", value=f"Loaded: {DEFAULT_NAME}", interactive=False
              )
+             opacity = gr.Slider(0.0, 1.0, 0.55, step=0.05, label="Opacity for the Map")

+         with gr.Row(scale=1):
              img = gr.Image(
+                 type="pil", label="Image", interactive=True, height=750, width=750
              )

      state = gr.State()

+     model_choice.change(load_model, inputs=model_choice, outputs=status)

+     img.upload(prepare, inputs=img, outputs=state)

      img.select(
+         click,
+         inputs=[state, opacity, img],
+         outputs=[img, state],
          show_progress="minimal",
      )

+     gr.Button("Reset").click(reset, outputs=[img, state])
+
+     # Examples from current directory
+     example_files = [
+         str(f)
+         for f in Path.cwd().iterdir()
+         if f.suffix.lower() in [".jpg", ".jpeg", ".png", ".webp"]
+     ]
+
+     if example_files:
+         gr.Examples(
+             examples=[[f] for f in example_files],
+             inputs=img,
+             fn=prepare,
+             outputs=[state],
+             label="Try an example image and then click on the objects.",
+             examples_per_page=4,
+             cache_examples=False,
+         )

+ if __name__ == "__main__":
+     demo.launch()
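
For reference, a minimal standalone sketch of the patch-similarity logic this commit lands in `_extract_grid` and `click`: embed the image, drop the CLS token plus any register tokens, reshape the remaining tokens into a patch grid, then score one patch against the whole grid by cosine similarity. The checkpoint id and "scene.jpg" are placeholders, and the center patch stands in for a user click; this is a sketch, not the app's exact code.

# Sketch of register-token-aware patch similarity (placeholder paths/ids).
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

model_id = "facebook/dinov3-vitl16-pretrain-sat493m"  # placeholder checkpoint
processor = AutoImageProcessor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).eval()

img = Image.open("scene.jpg").convert("RGB")  # placeholder image
inputs = processor(images=img, return_tensors="pt")

with torch.inference_mode():
    out = model(**inputs)

tokens = out.last_hidden_state[0].float()  # (1 + registers + patches, d)
num_reg = getattr(model.config, "num_register_tokens", 0)  # read defensively
p = model.config.patch_size
_, _, H, W = inputs.pixel_values.shape
gh, gw = H // p, W // p

# Drop the CLS token and any register tokens, keep the patch grid.
feats = tokens[1 + num_reg:].reshape(gh, gw, -1)

# Cosine similarity of one query patch (here: the center) vs. the grid.
j, i = gh // 2, gw // 2
d = feats.shape[-1]
grid = F.normalize(feats.reshape(-1, d), dim=1)
query = F.normalize(feats[j, i].reshape(1, d), dim=1)
sims = (grid @ query.T).reshape(gh, gw)  # values in [-1, 1]

# Min-max rescale to [0, 1], as the click handler does before colormapping.
heat01 = (sims - sims.min()) / (sims.max() - sims.min() + 1e-12)
print(heat01.shape)

The min-max rescaling means the heatmap always spans the full colormap, even when the absolute similarity values are tightly clustered.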