I have been trying for the last few hours to get this working. Does anybody have a working script they could share?
#70 opened by JLouisBiz
I've been struggling to implement this for about a day now. Has anyone else managed something similar, or already shared their solution?
DeepSeek helped:
from PIL import Image
import requests
from transformers import AutoModelForCausalLM, CLIPImageProcessor, LlamaTokenizerFast
from io import BytesIO
import torch
import os
# Disable all problematic attention implementations
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
MODEL_PATH = "/mnt/nvme0n1/LLM/Microsoft/Phi-3-vision-128k-instruct"
IMAGE_URL = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
def load_components():
    print("Loading components with stable configuration...")

    # Load tokenizer
    tokenizer = LlamaTokenizerFast.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True
    )

    # Load image processor
    image_processor = CLIPImageProcessor.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True
    )

    # Load model with stable settings
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16
    )

    # Force disable problematic attention mechanisms
    model.config.use_cache = False
    return tokenizer, image_processor, model.eval()
def generate_response():
    try:
        tokenizer, image_processor, model = load_components()

        # 1. Prepare conversation (must end with the user turn;
        #    add_generation_prompt appends the assistant tag)
        messages = [
            {"role": "user", "content": "<|image_1|>\nDescribe this image in detail."}
        ]
        # 2. Download and process image
        response = requests.get(IMAGE_URL)
        image = Image.open(BytesIO(response.content))
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # 3. Prepare inputs
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding="max_length",
            max_length=1024,
            truncation=True
        )
        inputs['pixel_values'] = image_processor(
            images=image,
            return_tensors="pt"
        ).pixel_values

        # 4. Move to device
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # 5. Generate response
        generation_config = {
            "max_new_tokens": 500,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "eos_token_id": tokenizer.eos_token_id,
            "pad_token_id": tokenizer.eos_token_id,
            "use_cache": False  # Critical for Phi-3-Vision
        }
        outputs = model.generate(
            **inputs,
            **generation_config
        )

        # 6. Decode and print
        response = tokenizer.decode(
            outputs[0][inputs['input_ids'].shape[1]:],
            skip_special_tokens=True
        )
        print("\n=== IMAGE DESCRIPTION ===")
        print(response)

    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        torch.cuda.empty_cache()


if __name__ == "__main__":
    generate_response()
That works, except that it hallucinates about the picture: it talks about birds and bicycles where there is only a graph.
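I suspect the hallucination happens because the model never really sees the image: when the tokenizer and CLIPImageProcessor are used separately, the <|image_1|> placeholder is never expanded and tied to the pixel_values, so the model answers from text alone. Below is a minimal sketch based on the usage example in the Phi-3-vision model card, using AutoProcessor so the prompt and image are prepared together; MODEL_PATH and IMAGE_URL are reused from the script above, and the generation settings are my assumptions to adjust as needed.

from io import BytesIO

import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

MODEL_PATH = "/mnt/nvme0n1/LLM/Microsoft/Phi-3-vision-128k-instruct"
IMAGE_URL = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"

# AutoProcessor bundles the tokenizer and image processor and expands the
# <|image_1|> placeholder into the image tokens the model expects.
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    _attn_implementation="eager",  # plain attention, same intent as the SDP flags above
).eval()

messages = [
    {"role": "user", "content": "<|image_1|>\nDescribe this image in detail."}
]
prompt = processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

image = Image.open(BytesIO(requests.get(IMAGE_URL).content)).convert("RGB")

# The processor pairs the prompt with the image and builds input_ids,
# pixel_values and image_sizes together, so the image tokens line up.
inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=500,
    do_sample=False,  # greedy decoding; assumption, sampling also works
    eos_token_id=processor.tokenizer.eos_token_id,
)
answer = processor.batch_decode(
    outputs[:, inputs["input_ids"].shape[1]:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]
print(answer)

If that is the cause, the description should start referring to the actual chart instead of birds and bicycles.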