Inference Fails with Type Mismatch Error: self and mat2 must have the same dtype, but got Half and Char

#68
by Manaf-paar - opened

Hi team,

I'm encountering an error during inference with the moondream2 model (vikhyatk/moondream2) on a Jetson Orin Nano. The model loads correctly, but when running inference on images, I consistently get the following error:

Inference result: Error processing image: self and mat2 must have the same dtype, but got Half and Char

Environment:

  • Device: Jetson Orin Nano
  • OS: Ubuntu 22.04
  • Python Version: 3.10
  • Transformers version: 4.48.3
  • torch: Jetson-compatible builds from NVIDIA wheels:
      • torch-2.6.0-cp310-cp310-linux_aarch64.whl
      • torchaudio-2.6.0-cp310-cp310-linux_aarch64.whl
      • torchvision-0.21.0-cp310-cp310-linux_aarch64.whl
  • Quantization: BitsAndBytesConfig(load_in_8bit=True)
  • Code to load model (a probe sketch follows this snippet):
self.model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    quantization_config=quantization_config,
    revision="2025-01-09",
    trust_remote_code=True,
    device_map={"": "cuda"}
)
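
For what it's worth, torch's Half is float16 and Char is int8, so it looks like a float16 activation is being multiplied against a still-quantized int8 weight somewhere in the remote code. This is the probe I've been using to see which submodules bitsandbytes actually replaced (the skip-list entry in the comment is a placeholder, not a confirmed moondream2 module name):

import bitsandbytes as bnb

def list_int8_modules(model):
    """Print every submodule that bitsandbytes converted to an int8 layer."""
    for name, module in model.named_modules():
        if isinstance(module, bnb.nn.Linear8bitLt):
            print("int8 module:", name)

# If the failing matmul sits in one of these, excluding it may help.
# "vision_encoder" is a hypothetical name; use the real ones printed above:
# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     llm_int8_skip_modules=["vision_encoder"],
# )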

Reproduction:

When running this snippet:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from PIL import Image
import torch
import time
import cv2
import gc
import os
import csv

class VLMInference:
    def __init__(self):
        """Initialize the VLMInference class without loading the model by default"""
        self.model = None  # Model is not loaded initially
        
    def load_model(self):
        """
        Load the Moondream model from pretrained weights and set it to eval mode
        
        Returns:
            model: Loaded Moondream model on CUDA
        """
        if self.model is not None:
            print("Model is already loaded.")
            return self.model
        
        try:
            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                "vikhyatk/moondream2",
                quantization_config=quantization_config,  # Use the config here
                revision="2025-01-09",
                trust_remote_code=True,
                device_map={"": "cuda"},
            )
            self.model.eval()  # Set model to evaluation mode after loading
            print("Model loaded successfully.")
            return self.model
        except Exception as e:
            raise RuntimeError(f"Failed to load model: {str(e)}")

    def unload_model(self):
        """
        Unload the model from memory and GPU
        """
        if self.model is None:
            print("No model is currently loaded.")
            return
        
        try:
            del self.model  # Delete the model object
            self.model = None  # Set to None to indicate it's unloaded
            torch.cuda.empty_cache()  # Clear GPU memory
            print("Model unloaded successfully.")
        except Exception as e:
            print(f"Error unloading model: {str(e)}")
    
    def run_moondream(self, image, prompt="how many trucks are there"):
        """
        Run inference on the provided image with the given prompt
        
        Args:
            image: Input image (expected to be in a format compatible with the model)
            prompt: Text prompt for the model (default: "how many trucks are there")
            
        Returns:
            str: Model response or error message
        """
        if self.model is None:
            return "Error: Model is not loaded. Please call load_model() first."
        
        try:
            # query() returns a dict; the text answer is under the "answer" key
            response = self.model.query(image, prompt)["answer"]
            return response
        except Exception as e:
            return f"Error processing image: {str(e)}"

    def point(self, image, prompt):
        """
        Detect objects in the image based on the provided prompt and print the count and pixel coordinates.
        
        Args:
            image: Input image
            prompt: Object category to detect (e.g., "person" or "car")
        """
        if self.model is None:
            print("Error: Model is not loaded. Please call load_model() first.")
            return []
        
        try:
            points = self.model.point(image, prompt)["points"]
            print(f"Found {len(points)} {prompt}(s)")

            image_width, image_height = image.size  # Get image dimensions

            for i, point in enumerate(points):
                if isinstance(point, dict) and 'x' in point and 'y' in point:
                    x_pixel = int(point['x'] * image_width)
                    y_pixel = int(point['y'] * image_height)
                    print(f"{prompt} {i+1}: ({x_pixel}, {y_pixel})")  # Now in pixel coordinates
                else:
                    print(f"Unexpected point format for {prompt} {i+1}: {point}")

            return points
        except Exception as e:
            print(f"Error detecting {prompt}: {str(e)}")
            return []

def print_gpu_memory():
    """Print GPU memory usage"""
    print(f"Allocated Memory: {torch.cuda.memory_allocated() / 1e6} MB")
    print(f"Reserved Memory: {torch.cuda.memory_reserved() / 1e6} MB")

def process_images_in_folder(folder_path, prompt, output_csv="timing_results.csv"):
    """
    Process all images in a folder and log timing information, including the prompt, to a CSV file.
    
    Args:
        folder_path: Path to the folder containing images
        prompt: Text prompt to use for inference (required)
        output_csv: Path to the output CSV file
    """
    vlm = VLMInference()
    
    # Supported image extensions
    image_extensions = (".jpg", ".jpeg", ".png")
    
    # Open CSV file for writing
    with open(output_csv, mode='w', newline='') as csv_file:
        fieldnames = ['image_name', 'load_time_s', 'inference_time_s', 'unload_time_s', 'prompt']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        
        # Iterate over all files in the folder
        for filename in os.listdir(folder_path):
            if filename.lower().endswith(image_extensions):
                image_path = os.path.join(folder_path, filename)
                print(f"\nProcessing {filename}...")
                
                # Load image, force RGB (PNGs may carry an alpha channel), and downscale
                image = Image.open(image_path).convert("RGB").resize((480, 480))
                
                # Measure load time
                start_time = time.time()
                vlm.load_model()
                load_time = time.time() - start_time
                
                # Measure inference time with the specified prompt
                start_time = time.time()
                result = vlm.run_moondream(image, prompt)
                inference_time = time.time() - start_time
                print(f"Inference result: {result}")
                
                # Measure unload time
                start_time = time.time()
                vlm.unload_model()
                unload_time = time.time() - start_time
                
                # Log to CSV
                writer.writerow({
                    'image_name': filename,
                    'load_time_s': load_time,
                    'inference_time_s': inference_time,
                    'unload_time_s': unload_time,
                    'prompt': prompt
                })
                
                # Print timing info
                print(f"Load time: {load_time:.3f}s")
                print(f"Inference time: {inference_time:.3f}s")
                print(f"Unload time: {unload_time:.3f}s")
                print_gpu_memory()
                
                # Optional: Small delay between iterations
                time.sleep(1)

if __name__ == "__main__":
    try:
        # Specify the folder containing images
        image_folder = "data"
        
        # Define the prompt (you can change this as needed)
        inference_prompt = "describe the image"
        
        # Process all images and log results
        process_images_in_folder(image_folder, prompt=inference_prompt, output_csv="int8.csv")
        
    except Exception as e:
        print(f"Error in main execution: {str(e)}")

The same error appears for every image. Full output:

Processing 100000046.jpg...
Model loaded successfully.
Inference result: Error processing image: self and mat2 must have the same dtype, but got Half and Char
Model unloaded successfully.
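
Since run_moondream swallows the exception, I can also swap in this variant temporarily to get the full stack trace and see where in the remote code the matmul fails (a sketch; same call, just without the catch-and-return):

import traceback

def run_moondream_debug(model, image, prompt):
    """Like run_moondream, but re-raises so the full traceback is visible."""
    try:
        return model.query(image, prompt)
    except Exception:
        traceback.print_exc()  # prints the full stack, not just str(e)
        raise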

Request:

Could you please look into:

  • Why is the dtype mismatch happening?
  • Does moondream2 support 8-bit or half-precision inference on the Jetson Orin Nano?
  • Is there a known workaround (e.g., forcing float32, or a proper dtype conversion for the image input)? The fallback I plan to test is sketched below.
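
In the meantime, the fallback I plan to test is loading in plain half precision with bitsandbytes removed (a minimal sketch, assuming the Orin Nano has enough memory for the fp16 weights; only the quantization arguments change):

from transformers import AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    revision="2025-01-09",
    trust_remote_code=True,
    torch_dtype=torch.float16,  # or torch.float32 if fp16 also misbehaves
    device_map={"": "cuda"},
)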

Thank you for your amazing work on this model. Looking forward to your help in resolving this issue.

Best regards,
Abdul Manaf PV
