I have been trying for the last few hours to get this working. Does anybody have a working script they could share?
#70 opened by JLouisBiz
I've been struggling to implement this for about a day now. Has anyone else managed something similar, or already shared their solution?
DeepSeek helped:
from PIL import Image
import requests
from transformers import AutoModelForCausalLM, CLIPImageProcessor, LlamaTokenizerFast
from io import BytesIO
import torch
import os
# Disable all problematic attention implementations
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
MODEL_PATH = "/mnt/nvme0n1/LLM/Microsoft/Phi-3-vision-128k-instruct"
IMAGE_URL = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
def load_components():
    print("Loading components with stable configuration...")

    # Load tokenizer
    tokenizer = LlamaTokenizerFast.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True
    )

    # Load image processor
    image_processor = CLIPImageProcessor.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True
    )

    # Load model with stable settings
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16
    )

    # Force disable problematic attention mechanisms
    model.config.use_cache = False
    return tokenizer, image_processor, model.eval()
def generate_response():
    try:
        tokenizer, image_processor, model = load_components()

        # 1. Prepare conversation (must end with the user turn;
        #    add_generation_prompt appends the assistant tag)
        messages = [
            {"role": "user", "content": "<|image_1|>\nDescribe this image in detail."}
        ]
        # 2. Download and process image
        response = requests.get(IMAGE_URL)
        image = Image.open(BytesIO(response.content))
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # 3. Prepare inputs
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding="max_length",
            max_length=1024,
            truncation=True
        )
        inputs['pixel_values'] = image_processor(
            images=image,
            return_tensors="pt"
        ).pixel_values

        # 4. Move to device
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # 5. Generate response
        generation_config = {
            "max_new_tokens": 500,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "eos_token_id": tokenizer.eos_token_id,
            "pad_token_id": tokenizer.eos_token_id,
            "use_cache": False  # Critical for Phi-3-Vision
        }
        outputs = model.generate(
            **inputs,
            **generation_config
        )

        # 6. Decode and print
        response = tokenizer.decode(
            outputs[0][inputs['input_ids'].shape[1]:],
            skip_special_tokens=True
        )
        print("\n=== IMAGE DESCRIPTION ===")
        print(response)

    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        torch.cuda.empty_cache()


if __name__ == "__main__":
    generate_response()
That works, except that it hallucinates about the picture: it talks about birds and bicycles where there is only a graph.
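I suspect the hallucination happens because the model never really sees the image: when the tokenizer and CLIPImageProcessor are used separately, the <|image_1|> placeholder is never expanded and tied to the pixel_values, so the model answers from text alone. Below is a minimal sketch based on the usage example in the Phi-3-vision model card, using AutoProcessor so the prompt and image are prepared together; MODEL_PATH and IMAGE_URL are reused from the script above, and the generation settings are my assumptions to adjust as needed.

from io import BytesIO

import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

MODEL_PATH = "/mnt/nvme0n1/LLM/Microsoft/Phi-3-vision-128k-instruct"
IMAGE_URL = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"

# AutoProcessor bundles the tokenizer and image processor and expands the
# <|image_1|> placeholder into the image tokens the model expects.
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    _attn_implementation="eager",  # plain attention, same intent as the SDP flags above
).eval()

messages = [
    {"role": "user", "content": "<|image_1|>\nDescribe this image in detail."}
]
prompt = processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

image = Image.open(BytesIO(requests.get(IMAGE_URL).content)).convert("RGB")

# The processor pairs the prompt with the image and builds input_ids,
# pixel_values and image_sizes together, so the image tokens line up.
inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=500,
    do_sample=False,  # greedy decoding; assumption, sampling also works
    eos_token_id=processor.tokenizer.eos_token_id,
)
answer = processor.batch_decode(
    outputs[:, inputs["input_ids"].shape[1]:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]
print(answer)

If that is the cause, the description should start referring to the actual chart instead of birds and bicycles.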