Inference without an image
Does this model support generating outputs without any images? I tried setting the image input parameter to None in the following code, but it doesn’t seem to work.
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
model_id = "llava-hf/llava-interleave-qwen-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(0)
processor = AutoProcessor.from_pretrained(model_id)
# Define a chat history and use `apply_chat_template` to get the correctly formatted prompt
# Each value in "content" has to be a list of dicts with types ("text", "image")
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What are these?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(prompt, None, return_tensors='pt').to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
generated_ids = [output_id[len(input_id):] for input_id, output_id in zip(inputs.input_ids, output)]
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)
Hey! If you're doing inference without images, you should remove the {"type": "image"} entry from your conversation template. Otherwise the model will expect as many images as there are {"type": "image"} fields in the conversation.
It seems that the error still occurs after removing {"type": "image"}:
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
model_id = "llava-hf/llava-interleave-qwen-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(0)
processor = AutoProcessor.from_pretrained(model_id)
# Define a chat history and use `apply_chat_template` to get the correctly formatted prompt
# Each value in "content" has to be a list of dicts with types ("text", "image")
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What are these?"}
        ]
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(prompt, None, return_tensors='pt').to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
generated_ids = [output_id[len(input_id):] for input_id, output_id in zip(inputs.input_ids, output)]
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)
Traceback (most recent call last):
  File "/data2/zhuang/test.py", line 33, in <module>
    inputs = processor(prompt, return_tensors='pt').to(0, torch.float16)
  File "/data2/zhuang/transformers/src/transformers/feature_extraction_utils.py", line 230, in to
    if torch.is_floating_point(v):
TypeError: is_floating_point(): argument 'input' (position 1) must be Tensor, not NoneType
Ah, I see what the issue is. It seems Llava wasn't used in a text-only setting before, or was only used on CPU. I will make a PR to fix it soon; imo we can support text-only use cases similarly to other VLMs in the library.
You can also pop "pixel_values" to test it before the fix is made, by adding:
inputs = processor(prompt, None, return_tensors='pt')
inputs.pop("pixel_values") # it is set to None, so we should remove it
inputs = inputs.to(0)  # now we can safely move tensors to cuda; no half-precision cast is needed since no floating-point tensors remain
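Putting it together, a minimal text-only sketch (reusing the model, processor, and text-only conversation from the snippet above; just a workaround until the fix lands) would be:
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(prompt, None, return_tensors='pt')
inputs.pop("pixel_values")  # None for text-only input, so drop it before moving to device
inputs = inputs.to(0)       # only integer tensors (input_ids, attention_mask) remain
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
generated_text = processor.batch_decode(output[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
print(generated_text)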
Thanks !!