Error when running the demo inference code

#23
by DrNicefellow - opened

Loading checkpoint shards: 100% 3/3 [00:12<00:00, 3.74s/it]

--- IMAGE PROCESSING ---
>>> Prompt
<|user|><|image_1|>What is shown in this image?<|end|><|assistant|>

RuntimeError Traceback (most recent call last)
Cell In[7], line 43
40 inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')
42 # Generate response
---> 43 generate_ids = model.generate(
44 **inputs,
45 max_new_tokens=1000,
46 generation_config=generation_config,
47 )
48 generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
49 response = processor.batch_decode(
50 generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
51 )[0]

File ~/anaconda3/lib/python3.11/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/transformers/generation/utils.py:2252, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
2244 input_ids, model_kwargs = self._expand_inputs_for_generation(
2245 input_ids=input_ids,
2246 expand_size=generation_config.num_return_sequences,
2247 is_encoder_decoder=self.config.is_encoder_decoder,
2248 **model_kwargs,
2249 )
2251 # 12. run sample (it degenerates to greedy search when generation_config.do_sample=False)
-> 2252 result = self._sample(
2253 input_ids,
2254 logits_processor=prepared_logits_processor,
2255 stopping_criteria=prepared_stopping_criteria,
2256 generation_config=generation_config,
2257 synced_gpus=synced_gpus,
2258 streamer=streamer,
2259 **model_kwargs,
2260 )
2262 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2263 # 11. prepare beam search scorer
2264 beam_scorer = BeamSearchScorer(
2265 batch_size=batch_size,
2266 num_beams=generation_config.num_beams,
(...)
2271 max_length=generation_config.max_length,
2272 )

File ~/anaconda3/lib/python3.11/site-packages/transformers/generation/utils.py:3251, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3248 model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
3250 if is_prefill:
-> 3251 outputs = self(**model_inputs, return_dict=True)
3252 is_prefill = False
3253 else:

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/modeling_phi4mm.py:2116, in Phi4MMForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, input_image_embeds, image_sizes, image_attention_mask, input_audio_embeds, audio_embed_sizes, audio_attention_mask, input_mode, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, num_logits_to_keep)
2113 raise ValueError(f"Invalid input_mode: {input_mode}")
2115 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 2116 outputs = self.model(
2117 input_ids=input_ids,
2118 attention_mask=attention_mask,
2119 position_ids=position_ids,
2120 past_key_values=past_key_values,
2121 inputs_embeds=inputs_embeds,
2122 input_image_embeds=input_image_embeds,
2123 image_sizes=image_sizes,
2124 image_attention_mask=image_attention_mask,
2125 input_audio_embeds=input_audio_embeds,
2126 audio_embed_sizes=audio_embed_sizes,
2127 audio_attention_mask=audio_attention_mask,
2128 audio_projection_mode=audio_projection_mode,
2129 use_cache=use_cache,
2130 output_attentions=output_attentions,
2131 output_hidden_states=output_hidden_states,
2132 return_dict=return_dict,
2133 )
2135 hidden_states = outputs[0]
2136 # Only compute necessary logits, and do not upcast them to float if we are not computing the loss

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/modeling_phi4mm.py:1707, in Phi4MMModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, input_image_embeds, image_sizes, image_attention_mask, input_audio_embeds, audio_embed_sizes, audio_attention_mask, audio_projection_mode, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, **kwargs)
1700 logger.warning_once(
1701 "We detected that you are passing past_key_values as a tuple of tuples. This is deprecated and "
1702 "will be removed in v4.47. Please convert your cache or use an appropriate Cache class "
1703 "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
1704 )
1706 if inputs_embeds is None:
-> 1707 inputs_embeds = self.embed_tokens_extend(
1708 input_ids=input_ids,
1709 input_embeds=inputs_embeds,
1710 input_image_embeds=input_image_embeds,
1711 input_audio_embeds=input_audio_embeds,
1712 image_sizes=image_sizes,
1713 image_attention_mask=image_attention_mask,
1714 audio_embed_sizes=audio_embed_sizes,
1715 audio_attention_mask=audio_attention_mask,
1716 audio_projection_mode=audio_projection_mode,
1717 wte=self.embed_tokens,
1718 )
1720 if cache_position is None:
1721 past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/modeling_phi4mm.py:769, in Phi4MMImageAudioEmbedding.forward(self, input_ids, input_embeds, input_image_embeds, input_audio_embeds, image_sizes, image_attention_mask, audio_embed_sizes, audio_attention_mask, audio_projection_mode, wte)
766 assert input_image_embeds is not None or input_audio_embeds is not None
768 if input_image_embeds is not None:
--> 769 image_hidden_states = self.image_embed(
770 input_ids=input_ids,
771 input_embeds=input_image_embeds,
772 image_sizes=image_sizes,
773 wte=wte,
774 image_attention_mask=image_attention_mask
775 )
776 if input_audio_embeds is not None:
777 audio_hidden_states = self.audio_embed(
778 input_ids=input_ids,
779 input_embeds=input_audio_embeds,
(...)
783 audio_projection_mode=audio_projection_mode,
784 )

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/modeling_phi4mm.py:328, in Phi4MMImageEmbedding.forward(self, input_ids, input_embeds, image_sizes, **kwargs)
326 # Nx(HW)xC
327 if image_attention_mask is not None and len(image_attention_mask) > 0:
--> 328 img_features = self.get_img_features(img_embeds.flatten(0, 1), attention_mask=image_attention_mask.type(torch.BoolTensor).flatten(0,1).to(target_device))
329 else:
330 img_features = self.get_img_features(img_embeds.flatten(0, 1))

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/modeling_phi4mm.py:194, in Phi4MMImageEmbedding.get_img_features(self, img_embeds, attention_mask)
192 else:
193 if attention_mask is not None:
--> 194 img_processor_output = self.img_processor(img_embeds, output_hidden_states=True, patch_attention_mask=attention_mask)
195 else:
196 img_processor_output = self.img_processor(img_embeds, output_hidden_states=True)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/vision_siglip_navit.py:1370, in SiglipVisionTransformer.forward(self, pixel_values, patch_attention_mask, output_attentions, output_hidden_states, return_dict)
1359 if patch_attention_mask is None:
1360 patch_attention_mask = torch.ones(
1361 size=(
1362 batch_size,
(...)
1367 device=pixel_values.device,
1368 )
-> 1370 hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
1372 patch_attention_mask = patch_attention_mask.view(batch_size, -1)
1373 # The call to _upad_input in _flash_attention_forward is expensive
1374 # So when the patch_attention_mask is full of 1s (i.e. attending to the whole sequence),
1375 # avoiding passing the attention_mask, which is equivalent to attending to the full sequence

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/vision_siglip_navit.py:599, in SiglipVisionEmbeddings.forward(self, pixel_values, patch_attention_mask)
596 bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
598 pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
--> 599 position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
601 position_ids = position_ids.to(self.position_embedding.weight.device)
603 embeddings = embeddings + self.position_embedding(position_ids)

RuntimeError: shape mismatch: value tensor of shape [1024] cannot be broadcast to indexing result of shape [992]
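The failing line is the position-embedding assignment in SiglipVisionEmbeddings.forward: pos_ids holds 1024 values, but the boolean patch attention mask only selects 992 slots of position_ids, so the masked assignment cannot broadcast. A minimal sketch (shapes chosen purely to mirror the error message, not the actual model tensors) reproduces the same RuntimeError:

```python
import torch

# Shapes chosen only to mirror the error message (1024 position ids vs. a
# patch mask that keeps 992 positions); these are not the real model tensors.
num_position_ids = 1024
num_valid_patches = 992

position_ids = torch.zeros(num_position_ids, dtype=torch.long)

# Boolean patch attention mask with only 992 entries set to True.
p_attn_mask = torch.zeros(num_position_ids, dtype=torch.bool)
p_attn_mask[:num_valid_patches] = True

pos_ids = torch.arange(num_position_ids)  # 1024 values

# Raises: RuntimeError: shape mismatch: value tensor of shape [1024] cannot be
# broadcast to indexing result of shape [992]
position_ids[p_attn_mask] = pos_ids
```

In other words, the image_attention_mask produced during preprocessing and the patch grid the vision tower builds position ids for do not agree on the number of valid patches; the maintainers' dependency note at the end of this thread is the first thing to rule out.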

@DrNicefellow Did you find a solution to this problem? I am also facing the same error.

@PrateekTikku Nah, not gonna waste time on that kind of thing.

nguyenbh changed discussion status to closed

If you have inference errors, please double-check your environment and dependencies. This is what we suggest:
https://huggingface.co/microsoft/Phi-4-multimodal-instruct#requirements
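For a quick comparison against that list, a small check like the sketch below prints the installed versions of the relevant packages (the package names here are an assumption based on a typical Phi-4-multimodal-instruct setup; adjust them to whatever the requirements page actually pins):

```python
# Quick environment check against the model card's requirements section.
# The package names below are an assumption; compare the printed versions
# against the pinned versions on the linked requirements page.
from importlib.metadata import PackageNotFoundError, version

packages = ["torch", "transformers", "accelerate", "flash_attn",
            "torchvision", "soundfile", "pillow", "scipy", "peft", "backoff"]

for name in packages:
    try:
        print(f"{name}=={version(name)}")
    except PackageNotFoundError:
        print(f"{name}: not installed")
```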
