Error when running the demo inference code

#23
by DrNicefellow - opened

Loading checkpoint shards: 100% 3/3 [00:12<00:00, 3.74s/it]

--- IMAGE PROCESSING ---
>>> Prompt
<|user|><|image_1|>What is shown in this image?<|end|><|assistant|>

RuntimeError Traceback (most recent call last)
Cell In[7], line 43
40 inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')
42 # Generate response
---> 43 generate_ids = model.generate(
44 **inputs,
45 max_new_tokens=1000,
46 generation_config=generation_config,
47 )
48 generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
49 response = processor.batch_decode(
50 generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
51 )[0]

File ~/anaconda3/lib/python3.11/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/transformers/generation/utils.py:2252, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
2244 input_ids, model_kwargs = self._expand_inputs_for_generation(
2245 input_ids=input_ids,
2246 expand_size=generation_config.num_return_sequences,
2247 is_encoder_decoder=self.config.is_encoder_decoder,
2248 **model_kwargs,
2249 )
2251 # 12. run sample (it degenerates to greedy search when generation_config.do_sample=False)
-> 2252 result = self._sample(
2253 input_ids,
2254 logits_processor=prepared_logits_processor,
2255 stopping_criteria=prepared_stopping_criteria,
2256 generation_config=generation_config,
2257 synced_gpus=synced_gpus,
2258 streamer=streamer,
2259 **model_kwargs,
2260 )
2262 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2263 # 11. prepare beam search scorer
2264 beam_scorer = BeamSearchScorer(
2265 batch_size=batch_size,
2266 num_beams=generation_config.num_beams,
(...)
2271 max_length=generation_config.max_length,
2272 )

File ~/anaconda3/lib/python3.11/site-packages/transformers/generation/utils.py:3251, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3248 model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
3250 if is_prefill:
-> 3251 outputs = self(**model_inputs, return_dict=True)
3252 is_prefill = False
3253 else:

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/modeling_phi4mm.py:2116, in Phi4MMForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, input_image_embeds, image_sizes, image_attention_mask, input_audio_embeds, audio_embed_sizes, audio_attention_mask, input_mode, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, num_logits_to_keep)
2113 raise ValueError(f"Invalid input_mode: {input_mode}")
2115 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 2116 outputs = self.model(
2117 input_ids=input_ids,
2118 attention_mask=attention_mask,
2119 position_ids=position_ids,
2120 past_key_values=past_key_values,
2121 inputs_embeds=inputs_embeds,
2122 input_image_embeds=input_image_embeds,
2123 image_sizes=image_sizes,
2124 image_attention_mask=image_attention_mask,
2125 input_audio_embeds=input_audio_embeds,
2126 audio_embed_sizes=audio_embed_sizes,
2127 audio_attention_mask=audio_attention_mask,
2128 audio_projection_mode=audio_projection_mode,
2129 use_cache=use_cache,
2130 output_attentions=output_attentions,
2131 output_hidden_states=output_hidden_states,
2132 return_dict=return_dict,
2133 )
2135 hidden_states = outputs[0]
2136 # Only compute necessary logits, and do not upcast them to float if we are not computing the loss

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/modeling_phi4mm.py:1707, in Phi4MMModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, input_image_embeds, image_sizes, image_attention_mask, input_audio_embeds, audio_embed_sizes, audio_attention_mask, audio_projection_mode, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, **kwargs)
1700 logger.warning_once(
1701 "We detected that you are passing past_key_values as a tuple of tuples. This is deprecated and "
1702 "will be removed in v4.47. Please convert your cache or use an appropriate Cache class "
1703 "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
1704 )
1706 if inputs_embeds is None:
-> 1707 inputs_embeds = self.embed_tokens_extend(
1708 input_ids=input_ids,
1709 input_embeds=inputs_embeds,
1710 input_image_embeds=input_image_embeds,
1711 input_audio_embeds=input_audio_embeds,
1712 image_sizes=image_sizes,
1713 image_attention_mask=image_attention_mask,
1714 audio_embed_sizes=audio_embed_sizes,
1715 audio_attention_mask=audio_attention_mask,
1716 audio_projection_mode=audio_projection_mode,
1717 wte=self.embed_tokens,
1718 )
1720 if cache_position is None:
1721 past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/modeling_phi4mm.py:769, in Phi4MMImageAudioEmbedding.forward(self, input_ids, input_embeds, input_image_embeds, input_audio_embeds, image_sizes, image_attention_mask, audio_embed_sizes, audio_attention_mask, audio_projection_mode, wte)
766 assert input_image_embeds is not None or input_audio_embeds is not None
768 if input_image_embeds is not None:
--> 769 image_hidden_states = self.image_embed(
770 input_ids=input_ids,
771 input_embeds=input_image_embeds,
772 image_sizes=image_sizes,
773 wte=wte,
774 image_attention_mask=image_attention_mask
775 )
776 if input_audio_embeds is not None:
777 audio_hidden_states = self.audio_embed(
778 input_ids=input_ids,
779 input_embeds=input_audio_embeds,
(...)
783 audio_projection_mode=audio_projection_mode,
784 )

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/modeling_phi4mm.py:328, in Phi4MMImageEmbedding.forward(self, input_ids, input_embeds, image_sizes, **kwargs)
326 # Nx(HW)xC
327 if image_attention_mask is not None and len(image_attention_mask) > 0:
--> 328 img_features = self.get_img_features(img_embeds.flatten(0, 1), attention_mask=image_attention_mask.type(torch.BoolTensor).flatten(0,1).to(target_device))
329 else:
330 img_features = self.get_img_features(img_embeds.flatten(0, 1))

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/modeling_phi4mm.py:194, in Phi4MMImageEmbedding.get_img_features(self, img_embeds, attention_mask)
192 else:
193 if attention_mask is not None:
--> 194 img_processor_output = self.img_processor(img_embeds, output_hidden_states=True, patch_attention_mask=attention_mask)
195 else:
196 img_processor_output = self.img_processor(img_embeds, output_hidden_states=True)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/vision_siglip_navit.py:1370, in SiglipVisionTransformer.forward(self, pixel_values, patch_attention_mask, output_attentions, output_hidden_states, return_dict)
1359 if patch_attention_mask is None:
1360 patch_attention_mask = torch.ones(
1361 size=(
1362 batch_size,
(...)
1367 device=pixel_values.device,
1368 )
-> 1370 hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
1372 patch_attention_mask = patch_attention_mask.view(batch_size, -1)
1373 # The call to _upad_input in _flash_attention_forward is expensive
1374 # So when the patch_attention_mask is full of 1s (i.e. attending to the whole sequence),
1375 # avoiding passing the attention_mask, which is equivalent to attending to the full sequence

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()

File ~/.cache/huggingface/modules/transformers_modules/microsoft_Phi-4-multimodal-instruct/vision_siglip_navit.py:599, in SiglipVisionEmbeddings.forward(self, pixel_values, patch_attention_mask)
596 bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
598 pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
--> 599 position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
601 position_ids = position_ids.to(self.position_embedding.weight.device)
603 embeddings = embeddings + self.position_embedding(position_ids)

RuntimeError: shape mismatch: value tensor of shape [1024] cannot be broadcast to indexing result of shape [992]
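The failing line is the position-embedding assignment in SiglipVisionEmbeddings.forward: pos_ids holds 1024 values, but the boolean patch attention mask only selects 992 slots of position_ids, so the masked assignment cannot broadcast. A minimal sketch (shapes chosen purely to mirror the error message, not the actual model tensors) reproduces the same RuntimeError:

```python
import torch

# Shapes chosen only to mirror the error message (1024 position ids vs. a
# patch mask that keeps 992 positions); these are not the real model tensors.
num_position_ids = 1024
num_valid_patches = 992

position_ids = torch.zeros(num_position_ids, dtype=torch.long)

# Boolean patch attention mask with only 992 entries set to True.
p_attn_mask = torch.zeros(num_position_ids, dtype=torch.bool)
p_attn_mask[:num_valid_patches] = True

pos_ids = torch.arange(num_position_ids)  # 1024 values

# Raises: RuntimeError: shape mismatch: value tensor of shape [1024] cannot be
# broadcast to indexing result of shape [992]
position_ids[p_attn_mask] = pos_ids
```

In other words, the image_attention_mask produced during preprocessing and the patch grid the vision tower builds position ids for do not agree on the number of valid patches; the maintainers' dependency note at the end of this thread is the first thing to rule out.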

@DrNicefellow Did you find a solution to this problem? I am also facing the same error.

@PrateekTikku Nah, not gonna waste time on that kind of thing.

nguyenbh changed discussion status to closed

If you have inference errors, please double-check your environment and dependencies. This is what we suggest:
https://huggingface.co/microsoft/Phi-4-multimodal-instruct#requirements
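For a quick comparison against that list, a small check like the sketch below prints the installed versions of the relevant packages (the package names here are an assumption based on a typical Phi-4-multimodal-instruct setup; adjust them to whatever the requirements page actually pins):

```python
# Quick environment check against the model card's requirements section.
# The package names below are an assumption; compare the printed versions
# against the pinned versions on the linked requirements page.
from importlib.metadata import PackageNotFoundError, version

packages = ["torch", "transformers", "accelerate", "flash_attn",
            "torchvision", "soundfile", "pillow", "scipy", "peft", "backoff"]

for name in packages:
    try:
        print(f"{name}=={version(name)}")
    except PackageNotFoundError:
        print(f"{name}: not installed")
```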
