from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import PeftModel
from PIL import Image
import torch

# Load the 4-bit base model and attach the fine-tuned LoRA adapter
model = AutoModelForVision2Seq.from_pretrained(
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
    device_map="auto",
    torch_dtype=torch.float16,
)
model = PeftModel.from_pretrained(model, "grohitraj/archive_classification")
model = model.half()  # cast the non-quantized (adapter) weights to fp16 to avoid a dtype mismatch
processor = AutoProcessor.from_pretrained("grohitraj/archive_classification")

# Test on a sample image; the <image> placeholder is required by the LLaVA processor
image = Image.open("example_from_2019ISIC_data.jpg")
prompt = "<image>\nDescribe about the image for male aged 54:"
inputs = processor(text=prompt, images=image, return_tensors="pt")
inputs = {
    k: v.to(model.device, torch.float16) if v.is_floating_point() else v.to(model.device)
    for k, v in inputs.items()
}  # move all tensors to the model's device; pixel_values must also be fp16

# Generate and decode the output
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=100)
print(processor.tokenizer.decode(outputs[0], skip_special_tokens=True))
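
# The decoded string above echoes the prompt before the model's answer. A
# minimal sketch (assuming `inputs` and `outputs` from the snippet above are
# still in scope) that decodes only the newly generated tokens:
prompt_len = inputs["input_ids"].shape[1]
answer = processor.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
print(answer)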