from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import PeftModel
from PIL import Image
import torch
# Load the 4-bit base model and attach the fine-tuned LoRA adapter
model = AutoModelForVision2Seq.from_pretrained(
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
    device_map="auto",
    torch_dtype=torch.float16,
)
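# Note: the bnb-4bit checkpoint ships its own quantization config, so the
# bitsandbytes package must be installed; device_map="auto" lets accelerate
# place layers across the available devices.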
model = PeftModel.from_pretrained(model, "grohitraj/archive_classification")
model = model.half()  # cast non-quantized weights (LoRA/norm layers) to fp16 so dtypes match
processor = AutoProcessor.from_pretrained("grohitraj/archive_classification")
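# If the adapter repo does not ship processor files, a fallback sketch
# (assumption: llava-hf/llava-1.5-7b-hf is the underlying base model) would be:
# processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")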
# Run a test inference on a sample image
image = Image.open("example_from_2019ISIC_data.jpg")
# "<image>" is the placeholder the processor replaces with image tokens; the
# prompt wording is left as-is since it presumably matches the fine-tuning format.
prompt = "<image>\nDescribe about the image for male aged 54:"
inputs = processor(text=prompt, images=image, return_tensors="pt")
# Move tensors to the model device; cast only floating-point tensors
# (pixel_values) to fp16 so they match the half-precision model.
inputs = {
    k: v.to(model.device, torch.float16) if v.is_floating_point() else v.to(model.device)
    for k, v in inputs.items()
}
# Generate a description and decode it
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=100)
print(processor.tokenizer.decode(outputs[0], skip_special_tokens=True))
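# Optional sketch: generate() echoes the prompt tokens before the continuation,
# so slicing them off yields only the model's answer (assumes the prompt length
# equals inputs["input_ids"].shape[1]).
answer = processor.tokenizer.decode(
    outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(answer)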