ViT-Visualizer / app.py
Felix Konrad
Please please work.
55226e5
import os
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import gradio as gr
from transformers import AutoModel, AutoImageProcessor
from PIL import Image
import torch
os.environ["HF_HUB_OFFLINE"] = "0"
# Global state to store loaded model + processors
state = {
"model_type": None,
"model": None,
"processor": None,
"repo_id": None,
}
def similarity_heatmap(image):
"""
Compute cosine similarity between CLS token and patch tokens
"""
model, processor = state["model"], state["processor"]
inputs = processor(images=image, return_tensors="pt")
pixel_values = inputs["pixel_values"].to(model.device) # shape: (1, 3, H, W)
# get ViT patch size (from model config)
patch_size = model.config.patch_size # usually 16
# Compute patch grid (needed for resizing later)
H_patch = pixel_values.shape[2] // patch_size
W_patch = pixel_values.shape[3] // patch_size
with torch.no_grad():
outputs = model(pixel_values) # last_hidden_state: (1, seq_len, hidden_dim)
last_hidden_state = outputs.last_hidden_state
cls_token = last_hidden_state[:, 0, :] # shape: (1, hidden_dim)
patch_tokens = last_hidden_state[:, 1:, :] # shape: (1, num_patches, hidden_dim)
cls_norm = cls_token / cls_token.norm(dim=-1, keepdim=True)
patch_norm = patch_tokens / patch_tokens.norm(dim=-1, keepdim=True)
cos_sim = torch.einsum("bd,bpd->bp", cls_norm, patch_norm) # shape: (1, num_patches)
cos_sim = cos_sim.reshape((H_patch, W_patch))
return np.array(cos_sim)
def overlay_cosine_grid_on_image(cos_grid: np.ndarray, image: Image.Image, alpha=0.5, colormap="viridis"):
"""
cos_grid: (H_patch, W_patch) numpy array of cosine similarities
image: PIL.Image
alpha: blending factor
colormap: matplotlib colormap name
"""
# Normalize cosine values to [0, 1] for colormap
norm_grid = (cos_grid - cos_grid.min()) / (cos_grid.max() - cos_grid.min() + 1e-8)
# Apply colormap
cmap = cm.get_cmap(colormap)
heatmap_rgba = cmap(norm_grid) # shape: (H_patch, W_patch, 4)
# Convert to RGB 0-255
heatmap_rgb = (heatmap_rgba[:, :, :3] * 255).astype(np.uint8)
heatmap_img = Image.fromarray(heatmap_rgb)
# Resize heatmap to match original image size
heatmap_resized = heatmap_img.resize(image.size, resample=Image.BILINEAR)
# Blend with original image
blended = Image.blend(image.convert("RGBA"), heatmap_resized.convert("RGBA"), alpha=alpha)
return blended
def load_model(repo_id: str, revision: str = None):
"""
Load a Hugging Face model + processor from Hub.
Works with any public repo_id.
"""
try:
# Clean up inputs
repo_id = repo_id.strip()
if not repo_id:
return "Please enter a model repo ID"
if revision and revision.strip() == "":
revision = None
# First try without cache_dir to avoid permission issues
try:
model = AutoModel.from_pretrained(
repo_id,
revision=revision,
trust_remote_code=True,
use_auth_token=False # Explicitly no auth for public models
)
processor = AutoImageProcessor.from_pretrained(
repo_id,
revision=revision,
trust_remote_code=True,
use_auth_token=False
)
except Exception as e1:
# If that fails, try with explicit cache directory
model = AutoModel.from_pretrained(
repo_id,
revision=revision,
cache_dir="/tmp/model_cache", # Use /tmp for better permissions
trust_remote_code=True,
use_auth_token=False,
local_files_only=False # Ensure we can download
)
processor = AutoImageProcessor.from_pretrained(
repo_id,
revision=revision,
cache_dir="/tmp/model_cache",
trust_remote_code=True,
use_auth_token=False,
local_files_only=False
)
# Move to appropriate device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()
# Validate it's a Vision Transformer
if not hasattr(model.config, 'patch_size'):
return f"Model '{repo_id}' doesn't appear to be a Vision Transformer (no patch_size in config)"
# Update global state
state["model"] = model
state["processor"] = processor
state["repo_id"] = repo_id
state["model_type"] = "custom"
patch_size = model.config.patch_size
return f"Successfully loaded ViT model '{repo_id}' (patch size: {patch_size}) on {device}"
except Exception as e:
error_str = str(e).lower()
if "repository not found" in error_str or "404" in error_str:
return f"Repository '{repo_id}' not found. Please check the repo ID."
elif "connection" in error_str or "network" in error_str or "offline" in error_str:
return f"Network error: {str(e)}"
elif "permission" in error_str or "forbidden" in error_str:
return f"Permission denied. This might be a private repository."
else:
return f"Error loading model: {str(e)}"
def display_image(image: Image):
"""
Simply returns the uploaded image.
"""
return image
def visualize_cosine_heatmap(image: Image):
"""
Generate and overlay cosine similarity heatmap on the input image.
"""
if state["model"] is None:
return None # Return None if no model is loaded
try:
cos_grid = similarity_heatmap(image)
blended = overlay_cosine_grid_on_image(cos_grid, image)
return blended
except Exception as e:
print(f"Error generating heatmap: {e}")
return None
# Gradio UI
with gr.Blocks(title="ViT CLS Visualizer") as demo:
gr.Markdown("# ViT CLS-Visualizer")
gr.Markdown(
"Enter the Hugging Face model repo ID (must be public), upload an image, "
"and visualize the cosine similarity between the CLS token and patches."
)
gr.Markdown("### Popular Vision Transformer models to try:")
gr.Markdown(
"- `google/vit-base-patch16-224`\n"
"- `facebook/deit-base-distilled-patch16-224`\n"
"- `microsoft/dit-base`"
)
with gr.Row():
repo_input = gr.Textbox(
label="Hugging Face Model Repo ID",
placeholder="e.g. google/vit-base-patch16-224",
value="google/vit-base-patch16-224"
)
revision_input = gr.Textbox(
label="Revision (optional)",
placeholder="branch, tag, or commit hash"
)
load_btn = gr.Button("Load Model", variant="primary")
load_status = gr.Textbox(label="Model Status", interactive=False)
with gr.Row():
with gr.Column():
image_input = gr.Image(type="pil", label="Upload Image")
image_output = gr.Image(label="Uploaded Image")
with gr.Column():
compute_btn = gr.Button("Compute Heatmap", variant="primary")
heatmap_output = gr.Image(label="Cosine Similarity Heatmap")
# Events
load_btn.click(
fn=load_model,
inputs=[repo_input, revision_input],
outputs=load_status
)
image_input.change(
fn=display_image,
inputs=image_input,
outputs=image_output
)
compute_btn.click(
fn=visualize_cosine_heatmap,
inputs=image_input,
outputs=heatmap_output
)
if __name__ == "__main__":
demo.launch()