LLaVA-Med Inference produces... "Mathematics"?
Hi,
Recently I've been trying to write an inference script for this model. I'm not sure if I'm doing anything wrong, but somehow the output of this model is... all maths. I cloned the repository from https://github.com/microsoft/LLaVA-Med/tree/main and got the weights from this Hugging Face repo, but it still doesn't work.
Here's my code for anyone to get a closer look:
from llava.model.builder import load_pretrained_model
tokenizer, model, image_processor, context_len = load_pretrained_model(
    'microsoft/llava-med-v1.5-mistral-7b',
    None,
    'llava-med-v1.5-mistral-7b')
from PIL import Image
# Load and process the image
image = Image.open("test-image/test_image.png").convert("RGB")
image_tensor = image_processor(image)
import torch
# Assuming 'image_tensor' is the output from the image_processor
# Extract the NumPy array from the 'pixel_values' key
numpy_array = image_tensor['pixel_values'][0]
# Convert the NumPy array to a PyTorch tensor
tensor_image = torch.from_numpy(numpy_array)
# If required, move the tensor to the appropriate device (e.g., GPU) and adjust data type
tensor_image = tensor_image.to(device='cuda')
# If the tokenizer has a pad_token_id, use it
if tokenizer.pad_token_id is not None:
    pad_token_id = tokenizer.pad_token_id
else:
    # Otherwise, set it to the eos_token_id
    pad_token_id = tokenizer.eos_token_id
# Assign the pad_token_id to the model's generation configuration
model.config.pad_token_id = pad_token_id
# Prepare the input prompt
query = "Where is the lesion located?"
prompt = f"<im_start>user <image><im_end>\n{query}"
# Tokenize the input text with padding and return attention mask
inputs = tokenizer(
    prompt,
    padding=True,                # Ensures the input is padded to the longest sequence
    return_tensors='pt',         # Returns PyTorch tensors
    return_attention_mask=True   # Includes the attention mask in the output
)
# Extract input_ids and attention_mask
input_ids = inputs.input_ids
attention_mask = inputs.attention_mask
input_ids = input_ids.unsqueeze(0).to(model.device)
tensor_image = tensor_image.unsqueeze(0).to(model.device)
# Generate the response
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=tensor_image,
        do_sample=True,
        temperature=0.7,
        top_p=0.7,
        max_new_tokens=1024
    )
# Decode and return the response
responses = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
responses
The output is:
>>> 'Question: Let l(k) = -11*k**2 - 15*k - 11. Let j(c) = 5*c**2 + 7*c + 6. Let r(p) = 11*j(p) - 6*l(p). Factor r(s).\nAnswer: 5*s*(s + 1)'
I don't know what's wrong with the checkpoint (or whether I'm just being silly).
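One possible culprit: the prompt goes straight through tokenizer(...), so <image> stays as ordinary text and never becomes the special IMAGE_TOKEN_INDEX placeholder that LLaVA's generation path looks for when splicing in image features. A minimal sketch of what the tokenization would probably need to look like instead (assuming the llava package from the cloned repo is importable):
from llava.constants import IMAGE_TOKEN_INDEX
from llava.mm_utils import tokenizer_image_token
# tokenizer_image_token splits the prompt on '<image>' and inserts the special
# IMAGE_TOKEN_INDEX placeholder between the text chunks, which a plain
# tokenizer(prompt) call never does.
prompt = f"<image>\n{query}"
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)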
Have you been able to resolve this?
@Boyxavi Try this one:
!pip install --upgrade transformers==4.37.2
!git clone https://github.com/microsoft/LLaVA-Med.git
%cd LLaVA-Med
!git clone https://huggingface.co/liuhaotian/llava-v1.5-13b
from llava.model.builder import load_pretrained_model
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.mm_utils import tokenizer_image_token, process_images
import torch
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path='microsoft/llava-med-v1.5-mistral-7b',
    model_base=None,
    model_name='llava-med-v1.5-mistral-7b')
from PIL import Image
image = Image.open("test.jpeg").convert("RGB")
image_tensor = process_images([image], image_processor, model.config)[0]
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates
conv_mode = "vicuna_v1"
conv = conv_templates[conv_mode].copy()
roles = conv.roles
# If the tokenizer has a pad_token_id, use it
if tokenizer.pad_token_id is not None:
    pad_token_id = tokenizer.pad_token_id
else:
    # Otherwise, set it to the eos_token_id
    pad_token_id = tokenizer.eos_token_id
# Assign the pad_token_id to the model's generation configuration
model.config.pad_token_id = pad_token_id
# Prepare the input prompt
inp = "What is shown in this image?\n<image>"
inp = inp.replace(DEFAULT_IMAGE_TOKEN, '').strip()
inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
conv.append_message(conv.roles[0], inp)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
# Generate the response
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        # attention_mask=attention_mask.cuda(),
        images=image_tensor.unsqueeze(0).half().cuda(),
        do_sample=True,
        temperature=0.2,
        num_beams=1,
        max_new_tokens=1024
    )
# Decode and return the response
responses = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
responses
Apologies if the code is messy; it's what's currently running in my project and... yeah, I can't do much about that. This script runs and produces output in line with what's reported in the paper, so you can try it yourself.
I got this:
TypeError: LlavaMistralForCausalLM.forward() got an unexpected keyword argument 'cache_position'
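That error usually points to a transformers version newer than what the LLaVA-Med code expects: recent releases pass a cache_position argument into forward(), which the custom LlavaMistralForCausalLM class doesn't accept. Pinning the version from the snippet above should be the quickest workaround (a sketch, assuming a pip-based notebook environment):
# Downgrade transformers so generate() stops passing cache_position into forward()
!pip install --upgrade transformers==4.37.2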
I just tried it again and it worked, thanks! Though it didn't work on the one I fine-tuned. Thanks anyway!