Kwai-Keye committed
Commit e0fb953 · verified · 1 Parent(s): 4afc13d

upload modeling_keye.py to support non-flash inference

Files changed (1)
  1. modeling_keye.py +226 -765
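This commit drops the hard FlashAttention-2 requirement from the custom modeling code, so the checkpoint can be loaded with SDPA or eager attention. A minimal loading sketch follows; the checkpoint id, the auto class, and the dtype choice are placeholders for illustration, not part of this commit:

```python
# Sketch: load Keye without flash-attn installed (checkpoint id is a placeholder).
import torch
from transformers import AutoModel, AutoProcessor

model_id = "Kwai-Keye/Keye-VL-8B-Preview"  # replace with the actual repo id
model = AutoModel.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",   # or "eager"; "flash_attention_2" still works if installed
    trust_remote_code=True,       # picks up this modeling_keye.py
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
```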
modeling_keye.py CHANGED
@@ -31,19 +31,10 @@ import torch.nn.functional as F
31
  from torch.nn import CrossEntropyLoss
32
 
33
  from transformers.activations import ACT2FN
34
- from transformers.cache_utils import (
35
- Cache,
36
- DynamicCache,
37
- SlidingWindowCache,
38
- StaticCache,
39
- )
40
  from transformers.generation import GenerationMixin
41
  from transformers.modeling_attn_mask_utils import AttentionMaskConverter
42
- from transformers.modeling_outputs import (
43
- BaseModelOutputWithPast,
44
- BaseModelOutput,
45
- BaseModelOutputWithPooling,
46
- )
47
  from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
48
  from transformers.modeling_utils import PreTrainedModel, sdpa_attention_forward
49
  from transformers.activations import GELUActivation, ACT2FN, PytorchGELUTanh
@@ -55,7 +46,7 @@ from transformers.utils import (
55
  logging,
56
  replace_return_docstrings,
57
  torch_int,
58
- is_flash_attn_greater_or_equal_2_10,
59
  )
60
  from .configuration_keye import KeyeConfig, KeyeVisionConfig
61
 
@@ -64,9 +55,9 @@ import warnings
64
  from typing import Any, Callable, Optional, Tuple, Union, List
65
  from torch import nn
66
  from torch.nn.init import _calculate_fan_in_and_fan_out
 
67
 
68
 
69
- assert is_flash_attn_2_available()
70
  if is_flash_attn_2_available():
71
  from flash_attn import flash_attn_varlen_func
72
  from flash_attn.layers.rotary import apply_rotary_emb
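The deleted `assert is_flash_attn_2_available()` is what previously made flash-attn mandatory at import time; the conditional import kept just below it is the pattern that now carries non-flash setups. A sketch of that guarded-import idea (the `None` fallbacks are illustrative, not taken from the file):

```python
# Sketch: import flash-attn only when present, so SDPA/eager-only
# environments can still import modeling_keye.py.
from transformers.utils import is_flash_attn_2_available

if is_flash_attn_2_available():
    from flash_attn import flash_attn_varlen_func
    from flash_attn.layers.rotary import apply_rotary_emb
else:
    # Non-flash attention paths must never reach these symbols.
    flash_attn_varlen_func = None
    apply_rotary_emb = None
```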
@@ -80,7 +71,6 @@ logger = logging.get_logger(__name__)
80
 
81
  _CONFIG_FOR_DOC = "KeyeConfig"
82
 
83
-
84
  class KeyeMLP(nn.Module):
85
  def __init__(self, config, bias: bool = False):
86
  super().__init__()
@@ -92,9 +82,7 @@ class KeyeMLP(nn.Module):
92
  self.act_fn = ACT2FN[config.hidden_act]
93
 
94
  def forward(self, hidden_state):
95
- return self.down_proj(
96
- self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)
97
- )
98
 
99
 
100
  def _trunc_normal_(tensor, mean, std, a, b):
@@ -134,11 +122,7 @@ def _trunc_normal_(tensor, mean, std, a, b):
134
 
135
 
136
  def trunc_normal_tf_(
137
- tensor: torch.Tensor,
138
- mean: float = 0.0,
139
- std: float = 1.0,
140
- a: float = -2.0,
141
- b: float = 2.0,
142
  ) -> torch.Tensor:
143
  """Fills the input Tensor with values drawn from a truncated
144
  normal distribution. The values are effectively drawn from the
@@ -196,39 +180,9 @@ def default_flax_embed_init(tensor):
196
  variance_scaling_(tensor, mode="fan_in", distribution="normal")
197
 
198
 
199
- @dataclass
200
- # Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
201
- class SiglipVisionModelOutput(ModelOutput):
202
- """
203
- Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
204
-
205
- Args:
206
- image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
207
- The image embeddings obtained by applying the projection layer to the pooler_output.
208
- last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
209
- Sequence of hidden-states at the output of the last layer of the model.
210
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
211
- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
212
- one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
213
-
214
- Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
215
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
216
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
217
- sequence_length)`.
218
-
219
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
220
- heads.
221
- """
222
-
223
- image_embeds: Optional[torch.FloatTensor] = None
224
- last_hidden_state: Optional[torch.FloatTensor] = None
225
- hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
226
- attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
227
-
228
-
229
  class Projector(nn.Module):
230
 
231
- def __init__(self, text_config: KeyeConfig, vision_config: KeyeVisionConfig):
232
  super().__init__()
233
  self.text_config = text_config
234
  self.vision_config = vision_config
@@ -247,9 +201,7 @@ class Projector(nn.Module):
247
  self.hidden_size, self.text_config.hidden_size, bias=True
248
  )
249
 
250
- def forward(
251
- self, image_features: torch.Tensor, image_grid_thw: List[Tuple[int, int, int]]
252
- ) -> torch.Tensor:
253
  m1, m2 = self.merge_kernel_size
254
  if isinstance(image_features, (list, tuple)):
255
  processed_features = list()
@@ -258,15 +210,7 @@ class Projector(nn.Module):
258
  t, h, w = image_grid
259
  from einops import rearrange
260
 
261
- image_feature = rearrange(
262
- image_feature,
263
- "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
264
- t=t,
265
- h=h // m1,
266
- p1=m1,
267
- w=w // m2,
268
- p2=m2,
269
- )
270
  hidden_states = self.linear_1(image_feature)
271
  hidden_states = self.act(hidden_states)
272
  hidden_states = self.linear_2(hidden_states)
@@ -284,7 +228,6 @@ class Projector(nn.Module):
284
 
285
  return hidden_states.view(*dims, -1)
286
 
287
-
288
  class SiglipVisionEmbeddings(nn.Module):
289
  def __init__(self, config: KeyeVisionConfig):
290
  super().__init__()
@@ -308,19 +251,9 @@ class SiglipVisionEmbeddings(nn.Module):
308
  self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
309
  self.packing_position_embedding = nn.Embedding(32768, self.embed_dim)
310
 
311
- self.register_buffer(
312
- "position_ids",
313
- torch.arange(self.num_positions).expand((1, -1)),
314
- persistent=False,
315
- )
316
 
317
- def interpolate_pos_encoding(
318
- self,
319
- embeddings: torch.Tensor,
320
- height: int,
321
- width: int,
322
- is_after_patchify: bool = False,
323
- ) -> torch.Tensor:
324
  """
325
  This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
326
  images. This method is also adapted to support torch.jit tracing and no class embeddings.
@@ -343,9 +276,7 @@ class SiglipVisionEmbeddings(nn.Module):
343
  new_width = width // self.patch_size
344
 
345
  sqrt_num_positions = torch_int(num_positions**0.5)
346
- patch_pos_embed = patch_pos_embed.reshape(
347
- 1, sqrt_num_positions, sqrt_num_positions, dim
348
- )
349
  patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
350
 
351
  patch_pos_embed = nn.functional.interpolate(
@@ -373,42 +304,33 @@ class SiglipVisionEmbeddings(nn.Module):
373
  if grid in self.cache_position_embedding:
374
  self.cache_position_count[grid] += 1
375
  return self.cache_position_embedding[grid]
376
-
377
  if len(self.cache_position_embedding) >= max_cache:
378
- min_hit_grid = min(
379
- self.cache_position_count, key=self.cache_position_count.get
380
- )
381
  self.cache_position_count.pop(min_hit_grid)
382
  self.cache_position_embedding.pop(min_hit_grid)
383
-
384
  position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True)
385
  self.cache_position_count[grid] = 1
386
  self.cache_position_embedding[grid] = position_embedding
387
  return position_embedding
388
 
389
  def forward(
390
- self,
391
- pixel_values: torch.FloatTensor,
392
  position_ids: Optional[torch.Tensor] = None,
393
- image_grid_thw: Optional[
394
- List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]
395
- ] = None,
396
- interpolate_pos_encoding=False,
397
  ) -> torch.Tensor:
398
  if pixel_values.dim() == 5:
399
  assert position_ids is not None
400
  from einops import rearrange
401
-
402
  batch_size, squence_len, channel, height, width = pixel_values.shape
403
  target_dtype = self.patch_embedding.weight.dtype
404
  pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w")
405
- patch_embeds = self.patch_embedding(
406
- pixel_values.to(dtype=target_dtype)
407
- ) # shape = [*, width, grid, grid]
408
  embeddings = patch_embeds.flatten(-2).squeeze(-1)
409
- embeddings = rearrange(
410
- embeddings, "(b l) d -> b l d", b=batch_size, l=squence_len
411
- )
412
 
413
  # todo: not dubug
414
  if interpolate_pos_encoding and image_grid_thw is not None:
@@ -416,21 +338,15 @@ class SiglipVisionEmbeddings(nn.Module):
416
  assert batch_size == 1
417
  start = 0
418
  image_embedding_list = list()
419
- assert (
420
- sum([np.prod(x) for x in flatten_image_grid_thw])
421
- == embeddings.shape[1]
422
- ), (flatten_image_grid_thw, embeddings.shape)
423
  embeddings = embeddings.squeeze(0)
424
  tmp_embeddings = list()
425
  for image_grid in image_grid_thw:
426
  t, h, w = image_grid
427
  end = start + t * h * w
428
- image_embeddings = embeddings[start:end, :]
429
- position_embedding = (
430
- self.interpolate_pos_encoding(image_embeddings, h, w, True)
431
- .squeeze(0)
432
- .repeat(t, 1)
433
- )
434
  image_embeddings = image_embeddings + position_embedding
435
  tmp_embeddings.append(image_embeddings)
436
  start = end
@@ -456,12 +372,8 @@ def eager_attention_forward(
456
  if attention_mask is not None:
457
  attn_weights = attn_weights + attention_mask
458
 
459
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
460
- query.dtype
461
- )
462
- attn_weights = nn.functional.dropout(
463
- attn_weights, p=dropout, training=module.training
464
- )
465
 
466
  attn_output = torch.matmul(attn_weights, value)
467
  attn_output = attn_output.transpose(1, 2).contiguous()
@@ -502,9 +414,7 @@ class SiglipAttention(nn.Module):
502
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
503
  """Input shape: Batch x Time x Channel"""
504
 
505
- use_flash_attn = (
506
- cu_seqlens is not None
507
- ) and self.config._attn_implementation == "flash_attention_2"
508
 
509
  batch_size, seq_length, embed_dim = hidden_states.shape
510
 
@@ -513,28 +423,21 @@ class SiglipAttention(nn.Module):
513
  values = self.v_proj(hidden_states)
514
 
515
  if rope_emb is None:
516
- queries = queries.view(
517
- batch_size, seq_length, self.num_heads, self.head_dim
518
- ).transpose(1, 2)
519
- keys = keys.view(
520
- batch_size, seq_length, self.num_heads, self.head_dim
521
- ).transpose(1, 2)
522
- values = values.view(
523
- batch_size, seq_length, self.num_heads, self.head_dim
524
- ).transpose(1, 2)
525
  else:
526
  assert cu_seqlens is not None, "Rope support flash attn only."
527
  cos, sin = rope_emb
528
- queries = queries.view(
529
- batch_size, seq_length, self.num_heads, self.head_dim
530
- )
531
  keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim)
532
- queries, keys = apply_rotary_pos_emb_flashatt(queries, keys, cos, sin)
 
 
 
533
  queries = queries.transpose(1, 2)
534
  keys = keys.transpose(1, 2)
535
- values = values.view(
536
- batch_size, seq_length, self.num_heads, self.head_dim
537
- ).transpose(1, 2)
538
 
539
  if not use_flash_attn:
540
  attention_interface: Callable = eager_attention_forward
@@ -557,25 +460,16 @@ class SiglipAttention(nn.Module):
557
  scaling=self.scale,
558
  dropout=0.0 if not self.training else self.dropout,
559
  )
560
- attn_output = attn_output.reshape(
561
- batch_size, seq_length, embed_dim
562
- ).contiguous()
563
  else:
564
  assert batch_size == 1, hidden_states.shape
565
  queries = queries.transpose(1, 2).squeeze(0)
566
  keys = keys.transpose(1, 2).squeeze(0)
567
  values = values.transpose(1, 2).squeeze(0)
568
 
569
- from flash_attn import flash_attn_func, flash_attn_varlen_func
570
-
571
  max_seqlen_q = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
572
  max_seqlen_k = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
573
- assert (
574
- cu_seqlens[-1].item()
575
- == queries.shape[0]
576
- == keys.shape[0]
577
- == values.shape[0]
578
- ), (cu_seqlens, queries.shape, keys.shape, values.shape)
579
 
580
  attn_output = flash_attn_varlen_func(
581
  queries,
@@ -841,9 +735,7 @@ class SiglipEncoder(nn.Module):
841
  embed_dim = config.hidden_size
842
  num_heads = config.num_attention_heads
843
  head_dim = embed_dim // num_heads
844
- self.layers = nn.ModuleList(
845
- [SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]
846
- )
847
  self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
848
  self.gradient_checkpointing = False
849
 
@@ -859,7 +751,6 @@ class SiglipEncoder(nn.Module):
859
 
860
  def build_window_index(self, image_grid, window_size, device):
861
  from einops import rearrange
862
-
863
  window_indices = list()
864
  pad_values = -100
865
  start_window_index = 0
@@ -871,25 +762,16 @@ class SiglipEncoder(nn.Module):
871
  pad_w = (-w) % window_size
872
  assert pad_h >= 0 and pad_w >= 0, (pad_h, pad_w)
873
  window_index = F.pad(window_index, (0, pad_w, 0, pad_h), value=pad_values)
874
- window_index = rearrange(
875
- window_index,
876
- "t (h p1) (w p2) -> t (h w) (p1 p2)",
877
- p1=window_size,
878
- p2=window_size,
879
- )
880
  window_seqlens = (window_index != pad_values).long().sum(-1).reshape(-1)
881
  window_index = window_index.reshape(-1)
882
  window_index = window_index[window_index != pad_values]
883
  window_indices.append(window_index + start_window_index)
884
- cu_seqlens_within_windows.append(
885
- window_seqlens.cumsum(0) + start_window_index
886
- )
887
  start_window_index += t * h * w
888
  window_indices = torch.concat(window_indices, dim=0)
889
  cu_seqlens_within_windows = torch.concat(cu_seqlens_within_windows, dim=0)
890
- cu_seqlens_within_windows = F.pad(
891
- cu_seqlens_within_windows, (1, 0), value=0
892
- ).to(torch.int32)
893
  return window_indices, cu_seqlens_within_windows
894
 
895
  # Ignore copy
@@ -901,9 +783,7 @@ class SiglipEncoder(nn.Module):
901
  output_attentions: Optional[bool] = None,
902
  output_hidden_states: Optional[bool] = None,
903
  cu_seqlens: Optional[List[torch.Tensor]] = None,
904
- image_grid_thw: Optional[
905
- List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]
906
- ] = None,
907
  height_position_ids: Optional[torch.Tensor] = None,
908
  width_position_ids: Optional[torch.Tensor] = None,
909
  use_rope: Optional[bool] = False,
@@ -936,17 +816,11 @@ class SiglipEncoder(nn.Module):
936
 
937
  vision_or_text = "vision"
938
  assert vision_or_text in ["vision", "text"]
939
- use_window_attn = window_size > 0 and vision_or_text == "vision"
940
  use_rope = (use_rope is True) and (vision_or_text == "vision")
941
- output_attentions = (
942
- output_attentions
943
- if output_attentions is not None
944
- else self.config.output_attentions
945
- )
946
  output_hidden_states = (
947
- output_hidden_states
948
- if output_hidden_states is not None
949
- else self.config.output_hidden_states
950
  )
951
 
952
  encoder_states = () if output_hidden_states else None
@@ -954,17 +828,10 @@ class SiglipEncoder(nn.Module):
954
 
955
  device = inputs_embeds.device
956
  hidden_states = inputs_embeds
957
- attention_mask = (
958
- attention_mask.to(inputs_embeds.dtype)
959
- if attention_mask is not None
960
- else None
961
- )
962
  if use_rope is True:
963
  flatten_image_grid_thw = self.flatten_list(image_grid_thw)
964
- assert (
965
- sum([np.prod(x) for x in flatten_image_grid_thw])
966
- == hidden_states.shape[1]
967
- ), (flatten_image_grid_thw, hidden_states.shape)
968
 
969
  if width_position_ids is None or height_position_ids is None:
970
  split_hids = list()
@@ -977,13 +844,11 @@ class SiglipEncoder(nn.Module):
977
  split_wids.append(sample_wids)
978
  width_position_ids = torch.concat(split_wids, dim=0)
979
  height_position_ids = torch.concat(split_hids, dim=0)
980
-
981
  window_indices, cu_seqlens_within_windows = None, None
982
 
983
  if use_window_attn:
984
- window_indices, cu_seqlens_within_windows = self.build_window_index(
985
- flatten_image_grid_thw, window_size, device
986
- )
987
  reversed_window_indices = window_indices.argsort()
988
  height_position_ids = height_position_ids[window_indices]
989
  width_position_ids = width_position_ids[window_indices]
@@ -998,17 +863,12 @@ class SiglipEncoder(nn.Module):
998
 
999
  rope_emb = None
1000
  window_indices, cu_seqlens_within_windows = None, None
1001
-
1002
  if use_window_attn:
1003
  flatten_image_grid_thw = self.flatten_list(image_grid_thw)
1004
- assert (
1005
- sum([np.prod(x) for x in flatten_image_grid_thw])
1006
- == hidden_states.shape[1]
1007
- ), (flatten_image_grid_thw, hidden_states.shape)
1008
-
1009
- window_indices, cu_seqlens_within_windows = self.build_window_index(
1010
- flatten_image_grid_thw, window_size, device
1011
- )
1012
  reversed_window_indices = window_indices.argsort()
1013
 
1014
  if use_window_attn:
@@ -1020,11 +880,7 @@ class SiglipEncoder(nn.Module):
1020
 
1021
  for encoder_layer in self.layers:
1022
  if output_hidden_states:
1023
- encoder_states = encoder_states + (
1024
- (hidden_states[:, reversed_window_indices, :],)
1025
- if use_window_attn
1026
- else (hidden_states,)
1027
- )
1028
  if self.gradient_checkpointing and self.training:
1029
  layer_outputs = self._gradient_checkpointing_func(
1030
  encoder_layer.__call__,
@@ -1070,17 +926,13 @@ class SiglipVisionTransformer(nn.Module):
1070
  self.embeddings = SiglipVisionEmbeddings(config)
1071
  self.encoder = SiglipEncoder(config)
1072
  self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
1073
- self.use_head = (
1074
- True if not hasattr(config, "vision_use_head") else config.vision_use_head
1075
- )
1076
  if self.use_head:
1077
  self.head = SiglipMultiheadAttentionPoolingHead(config)
1078
 
1079
  # @can_return_tuple
1080
  @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1081
- @replace_return_docstrings(
1082
- output_type=BaseModelOutputWithPooling, config_class=KeyeVisionConfig
1083
- )
1084
  def forward(
1085
  self,
1086
  pixel_values,
@@ -1096,9 +948,7 @@ class SiglipVisionTransformer(nn.Module):
1096
  cu_seqlens: Optional[List[torch.Tensor]] = None,
1097
  padding_mask: Optional[torch.Tensor] = None,
1098
  vision_return_embed_list: Optional[bool] = False,
1099
- image_grid_thw: Optional[
1100
- List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]
1101
- ] = None,
1102
  return_pooler_output: Optional[bool] = True,
1103
  use_rope: Optional[bool] = False,
1104
  window_size: Optional[bool] = -1,
@@ -1107,21 +957,15 @@ class SiglipVisionTransformer(nn.Module):
1107
  Returns:
1108
 
1109
  """
1110
- output_attentions = (
1111
- output_attentions
1112
- if output_attentions is not None
1113
- else self.config.output_attentions
1114
- )
1115
  output_hidden_states = (
1116
- output_hidden_states
1117
- if output_hidden_states is not None
1118
- else self.config.output_hidden_states
1119
  )
1120
  hidden_states = self.embeddings(
1121
- pixel_values,
1122
- interpolate_pos_encoding=interpolate_pos_encoding,
1123
  position_ids=position_ids,
1124
- image_grid_thw=image_grid_thw,
1125
  )
1126
 
1127
  encoder_outputs: BaseModelOutput = self.encoder(
@@ -1157,32 +1001,22 @@ class SiglipVisionTransformer(nn.Module):
1157
  token_indices = (sample_index == sample_idx).nonzero().flatten()
1158
  sample_hidden_state = hidden_state[token_indices]
1159
  sample_hidden_state_list.append(sample_hidden_state)
1160
-
1161
  if not vision_return_embed_list:
1162
- max_length = max(
1163
- [_state.shape[0] for _state in sample_hidden_state_list]
1164
- )
1165
  tmp_sample_hidden_state_list = list()
1166
  padding_mask = list()
1167
  for idx, _state in enumerate(sample_hidden_state_list):
1168
  padding_length = max_length - _state.shape[0]
1169
- mask = _state.new_zeros(size=(max_length,), dtype=torch.int64)
1170
- mask[-padding_length:] = 1
1171
  padding_mask.append(mask)
1172
  padding = _state.new_zeros(size=(padding_length, dim))
1173
  new_state = torch.concat([_state, padding], dim=0)
1174
  tmp_sample_hidden_state_list.append(new_state)
1175
- sample_hidden_state = torch.stack(
1176
- tmp_sample_hidden_state_list, dim=0
1177
- )
1178
- padding_mask = (
1179
- torch.stack(padding_mask, dim=0)
1180
- .float()
1181
- .to(last_hidden_state.dtype)
1182
- )
1183
- pooler_output = self.head(
1184
- sample_hidden_state, key_padding_mask=padding_mask
1185
- )
1186
  else:
1187
  pooler_output = list()
1188
  for state in sample_hidden_state_list:
@@ -1206,15 +1040,15 @@ class SiglipVisionTransformer(nn.Module):
1206
  hidden_states=encoder_outputs.hidden_states,
1207
  attentions=encoder_outputs.attentions,
1208
  )
1209
-
1210
  sample_hidden_state = list()
1211
  assert cu_seqlens is not None
1212
  for i in range(cu_seqlens.shape[0] - 1):
1213
  start = cu_seqlens[i]
1214
  end = cu_seqlens[i + 1]
1215
- tensor = last_hidden_state[:, start:end, :].squeeze(0)
1216
  sample_hidden_state.append(tensor)
1217
-
1218
  return BaseModelOutputWithPooling(
1219
  last_hidden_state=sample_hidden_state,
1220
  pooler_output=None,
@@ -1230,9 +1064,7 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
1230
  super().__init__()
1231
 
1232
  self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
1233
- self.attention = torch.nn.MultiheadAttention(
1234
- config.hidden_size, config.num_attention_heads, batch_first=True
1235
- )
1236
  self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1237
  self.mlp = SiglipMLP(config)
1238
 
@@ -1240,9 +1072,7 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
1240
  batch_size = hidden_state.shape[0]
1241
  probe = self.probe.repeat(batch_size, 1, 1)
1242
 
1243
- hidden_state = self.attention(
1244
- probe, hidden_state, hidden_state, key_padding_mask=key_padding_mask
1245
- )[0]
1246
 
1247
  residual = hidden_state
1248
  hidden_state = self.layernorm(hidden_state)
@@ -1272,9 +1102,7 @@ class SiglipVisionModel(SiglipPreTrainedModel):
1272
 
1273
  # @can_return_tuple
1274
  @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1275
- @replace_return_docstrings(
1276
- output_type=BaseModelOutputWithPooling, config_class=KeyeVisionConfig
1277
- )
1278
  def forward(
1279
  self,
1280
  pixel_values,
@@ -1284,9 +1112,7 @@ class SiglipVisionModel(SiglipPreTrainedModel):
1284
  interpolate_pos_encoding: bool = False,
1285
  position_ids: Optional[torch.Tensor] = None,
1286
  vision_return_embed_list: Optional[bool] = False,
1287
- image_grid_thw: Optional[
1288
- List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]
1289
- ] = None,
1290
  cu_seqlens: Optional[List[torch.Tensor]] = None,
1291
  return_pooler_output: Optional[bool] = True,
1292
  use_rope: Optional[bool] = False,
@@ -1331,6 +1157,7 @@ class SiglipVisionModel(SiglipPreTrainedModel):
1331
  )
1332
 
1333
 
 
1334
  class Qwen3RMSNorm(nn.Module):
1335
  def __init__(self, hidden_size, eps=1e-6):
1336
  """
@@ -1377,6 +1204,7 @@ def apply_rotary_pos_emb_flashatt(
1377
  return q_embed, k_embed
1378
 
1379
 
 
1380
  def rotate_half(x):
1381
  """Rotates half the hidden dims of the input."""
1382
  x1 = x[..., : x.shape[-1] // 2]
@@ -1397,156 +1225,6 @@ def apply_rotary_pos_emb_vision(
1397
  k_embed = k_embed.to(orig_k_dtype)
1398
  return q_embed, k_embed
1399
 
1400
-
1401
- class KeyeVisionAttention(nn.Module):
1402
- def __init__(self, dim: int, num_heads: int = 16) -> None:
1403
- super().__init__()
1404
- self.num_heads = num_heads
1405
- self.head_dim = dim // num_heads
1406
- self.qkv = nn.Linear(dim, dim * 3, bias=True)
1407
- self.proj = nn.Linear(dim, dim)
1408
-
1409
- def forward(
1410
- self,
1411
- hidden_states: torch.Tensor,
1412
- cu_seqlens: torch.Tensor,
1413
- rotary_pos_emb: Optional[torch.Tensor] = None,
1414
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1415
- ) -> torch.Tensor:
1416
- seq_length = hidden_states.shape[0]
1417
- q, k, v = (
1418
- self.qkv(hidden_states)
1419
- .reshape(seq_length, self.num_heads, 3, -1)
1420
- .permute(2, 0, 1, 3)
1421
- .unbind(0)
1422
- )
1423
- if position_embeddings is None:
1424
- logger.warning_once(
1425
- "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
1426
- "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
1427
- "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
1428
- "removed and `position_embeddings` will be mandatory."
1429
- )
1430
- emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
1431
- cos = emb.cos()
1432
- sin = emb.sin()
1433
- else:
1434
- cos, sin = position_embeddings
1435
- q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
1436
-
1437
- attention_mask = torch.full(
1438
- [1, seq_length, seq_length],
1439
- torch.finfo(q.dtype).min,
1440
- device=q.device,
1441
- dtype=q.dtype,
1442
- )
1443
- for i in range(1, len(cu_seqlens)):
1444
- attention_mask[
1445
- ...,
1446
- cu_seqlens[i - 1] : cu_seqlens[i],
1447
- cu_seqlens[i - 1] : cu_seqlens[i],
1448
- ] = 0
1449
-
1450
- q = q.transpose(0, 1)
1451
- k = k.transpose(0, 1)
1452
- v = v.transpose(0, 1)
1453
- attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
1454
- attn_weights = attn_weights + attention_mask
1455
- attn_weights = nn.functional.softmax(
1456
- attn_weights, dim=-1, dtype=torch.float32
1457
- ).to(q.dtype)
1458
- attn_output = torch.matmul(attn_weights, v)
1459
- attn_output = attn_output.transpose(0, 1)
1460
- attn_output = attn_output.reshape(seq_length, -1)
1461
- attn_output = self.proj(attn_output)
1462
- return attn_output
1463
-
1464
-
1465
- class KeyeVisionSdpaAttention(nn.Module):
1466
- def __init__(self, dim: int, num_heads: int = 16) -> None:
1467
- super().__init__()
1468
- self.num_heads = num_heads
1469
- self.qkv = nn.Linear(dim, dim * 3, bias=True)
1470
- self.proj = nn.Linear(dim, dim)
1471
-
1472
- def forward(
1473
- self,
1474
- hidden_states: torch.Tensor,
1475
- cu_seqlens: torch.Tensor,
1476
- rotary_pos_emb: Optional[torch.Tensor] = None,
1477
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1478
- ) -> torch.Tensor:
1479
- seq_length = hidden_states.shape[0]
1480
- # q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
1481
- q, k, v = (
1482
- self.qkv(hidden_states)
1483
- .reshape(seq_length, self.num_heads, 3, -1)
1484
- .permute(2, 0, 1, 3)
1485
- .unbind(0)
1486
- )
1487
- if position_embeddings is None:
1488
- logger.warning_once(
1489
- "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
1490
- "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
1491
- "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
1492
- "removed and `position_embeddings` will be mandatory."
1493
- )
1494
- emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
1495
- cos = emb.cos()
1496
- sin = emb.sin()
1497
- else:
1498
- cos, sin = position_embeddings
1499
- q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
1500
-
1501
- attention_mask = torch.zeros(
1502
- [1, seq_length, seq_length], device=q.device, dtype=torch.bool
1503
- )
1504
- for i in range(1, len(cu_seqlens)):
1505
- attention_mask[
1506
- ...,
1507
- cu_seqlens[i - 1] : cu_seqlens[i],
1508
- cu_seqlens[i - 1] : cu_seqlens[i],
1509
- ] = True
1510
- q = q.transpose(0, 1)
1511
- k = k.transpose(0, 1)
1512
- v = v.transpose(0, 1)
1513
- attn_output = F.scaled_dot_product_attention(
1514
- q, k, v, attention_mask, dropout_p=0.0
1515
- )
1516
- attn_output = attn_output.transpose(0, 1)
1517
- attn_output = attn_output.reshape(seq_length, -1)
1518
- attn_output = self.proj(attn_output)
1519
- return attn_output
1520
-
1521
-
1522
- class KeyeVisionBlock(nn.Module):
1523
- def __init__(self, config, attn_implementation: str = "sdpa") -> None:
1524
- super().__init__()
1525
- self.norm1 = Qwen3RMSNorm(config.hidden_size, eps=1e-6)
1526
- self.norm2 = Qwen3RMSNorm(config.hidden_size, eps=1e-6)
1527
- assert attn_implementation == "flash_attention_2"
1528
- self.attn = QWEN3_ATTENTION_CLASSES[attn_implementation](
1529
- config.hidden_size, num_heads=config.num_heads
1530
- )
1531
- self.mlp = KeyeMLP(config, bias=True)
1532
-
1533
- def forward(
1534
- self,
1535
- hidden_states: torch.Tensor,
1536
- cu_seqlens: torch.Tensor,
1537
- rotary_pos_emb: Optional[torch.Tensor] = None,
1538
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1539
- ) -> torch.Tensor:
1540
- hidden_states = hidden_states + self.attn(
1541
- self.norm1(hidden_states),
1542
- cu_seqlens=cu_seqlens,
1543
- rotary_pos_emb=rotary_pos_emb,
1544
- position_embeddings=position_embeddings,
1545
- )
1546
- hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
1547
- return hidden_states
1548
-
1549
-
1550
  Keye_START_DOCSTRING = r"""
1551
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1552
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
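The removed `KeyeVisionSdpaAttention` above shows the non-flash substitute for `flash_attn_varlen_func`: packed variable-length sequences are handled by building a block-diagonal boolean mask from `cu_seqlens` and calling `scaled_dot_product_attention`. A self-contained sketch of that mechanism, with illustrative names and shapes:

```python
# Sketch: varlen attention without flash-attn, via SDPA and a
# block-diagonal mask built from cu_seqlens (illustrative helper).
import torch
import torch.nn.functional as F

def sdpa_varlen(q, k, v, cu_seqlens):
    # q, k, v: (total_tokens, num_heads, head_dim); cu_seqlens: (num_seqs + 1,)
    seq_len = q.shape[0]
    mask = torch.zeros(1, seq_len, seq_len, dtype=torch.bool, device=q.device)
    for i in range(1, len(cu_seqlens)):
        s, e = cu_seqlens[i - 1], cu_seqlens[i]
        mask[..., s:e, s:e] = True  # tokens attend only within their own sequence
    q, k, v = (t.transpose(0, 1) for t in (q, k, v))  # -> (num_heads, tokens, head_dim)
    out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
    return out.transpose(0, 1).reshape(seq_len, -1)  # (total_tokens, num_heads * head_dim)
```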
@@ -1572,7 +1250,7 @@ class Qwen3PreTrainedModel(PreTrainedModel):
1572
  config_class = KeyeConfig
1573
  base_model_prefix = "model"
1574
  supports_gradient_checkpointing = True
1575
- _no_split_modules = ["KeyeDecoderLayer", "KeyeVisionBlock"]
1576
  _skip_keys_device_placement = "past_key_values"
1577
  _supports_flash_attn_2 = True
1578
  _supports_sdpa = True
@@ -1591,6 +1269,7 @@ class Qwen3PreTrainedModel(PreTrainedModel):
1591
  module.weight.data[module.padding_idx].zero_()
1592
 
1593
 
 
1594
  class SigLIPRotaryEmbedding(nn.Module):
1595
  def __init__(self, dim: int, theta: float = 10000.0) -> None:
1596
  super().__init__()
@@ -1599,15 +1278,11 @@ class SigLIPRotaryEmbedding(nn.Module):
1599
  self.rope_init()
1600
 
1601
  def rope_init(self):
1602
- inv_freq = 1.0 / (
1603
- self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim)
1604
- )
1605
  self.register_buffer("inv_freq", inv_freq, persistent=False)
1606
 
1607
  def forward(self, seqlen: int) -> torch.Tensor:
1608
- seq = torch.arange(
1609
- seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
1610
- )
1611
  freqs = torch.outer(seq, self.inv_freq)
1612
  return freqs
1613
 
@@ -1634,19 +1309,15 @@ class KeyeRotaryEmbedding(nn.Module):
1634
  else:
1635
  # BC: "rope_type" was originally "type"
1636
  if config.rope_scaling is not None:
1637
- self.rope_type = config.rope_scaling.get(
1638
- "rope_type", config.rope_scaling.get("type")
1639
- )
1640
  else:
1641
  self.rope_type = "default"
1642
  self.max_seq_len_cached = config.max_position_embeddings
1643
  self.original_max_seq_len = config.max_position_embeddings
1644
-
1645
  # BC: "rope_type" was originally "type"
1646
  if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
1647
- self.rope_type = config.rope_scaling.get(
1648
- "rope_type", config.rope_scaling.get("type")
1649
- )
1650
  else:
1651
  self.rope_type = "default"
1652
  self.max_seq_len_cached = config.max_position_embeddings
@@ -1670,15 +1341,10 @@ class KeyeRotaryEmbedding(nn.Module):
1670
  inv_freq, self.attention_scaling = self.rope_init_fn(
1671
  self.config, device, seq_len=seq_len, **self.rope_kwargs
1672
  )
1673
- self.register_buffer(
1674
- "inv_freq", inv_freq, persistent=False
1675
- ) # TODO joao: may break with compilation
1676
  self.max_seq_len_cached = seq_len
1677
 
1678
- if (
1679
- seq_len < self.original_max_seq_len
1680
- and self.max_seq_len_cached > self.original_max_seq_len
1681
- ): # reset
1682
  self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
1683
  self.max_seq_len_cached = self.original_max_seq_len
1684
 
@@ -1689,25 +1355,13 @@ class KeyeRotaryEmbedding(nn.Module):
1689
 
1690
  # Core RoPE block. In contrast to other models, Keye has different position ids for the grids
1691
  # So we expand the inv_freq to shape (3, ...)
1692
- inv_freq_expanded = (
1693
- self.inv_freq[None, None, :, None]
1694
- .float()
1695
- .expand(3, position_ids.shape[1], -1, 1)
1696
- )
1697
- position_ids_expanded = position_ids[
1698
- :, :, None, :
1699
- ].float() # shape (3, bs, 1, positions)
1700
  # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
1701
  device_type = x.device.type
1702
- device_type = (
1703
- device_type
1704
- if isinstance(device_type, str) and device_type != "mps"
1705
- else "cpu"
1706
- )
1707
  with torch.autocast(device_type=device_type, enabled=False):
1708
- freqs = (
1709
- inv_freq_expanded.float() @ position_ids_expanded.float()
1710
- ).transpose(2, 3)
1711
  emb = torch.cat((freqs, freqs), dim=-1)
1712
  cos = emb.cos()
1713
  sin = emb.sin()
@@ -1777,12 +1431,12 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
1777
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
1778
  """
1779
  mrope_section = mrope_section * 2
1780
- cos = torch.cat(
1781
- [m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1
1782
- ).unsqueeze(unsqueeze_dim)
1783
- sin = torch.cat(
1784
- [m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1
1785
- ).unsqueeze(unsqueeze_dim)
1786
 
1787
  q_embed = (q * cos) + (rotate_half(q) * sin)
1788
  k_embed = (k * cos) + (rotate_half(k) * sin)
@@ -1797,9 +1451,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
1797
  batch, num_key_value_heads, slen, head_dim = hidden_states.shape
1798
  if n_rep == 1:
1799
  return hidden_states
1800
- hidden_states = hidden_states[:, :, None, :, :].expand(
1801
- batch, num_key_value_heads, n_rep, slen, head_dim
1802
- )
1803
  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
1804
 
1805
 
@@ -1822,43 +1474,27 @@ class KeyeAttention(nn.Module):
1822
 
1823
  self.hidden_size = config.hidden_size
1824
  self.num_heads = config.num_attention_heads
1825
- self.head_dim = getattr(
1826
- config, "head_dim", config.hidden_size // config.num_attention_heads
1827
- )
1828
  self.num_key_value_heads = config.num_key_value_heads
1829
- self.num_key_value_groups = (
1830
- config.num_attention_heads // config.num_key_value_heads
1831
- )
1832
  self.is_causal = True
1833
  self.attention_dropout = config.attention_dropout
1834
  self.rope_scaling = config.rope_scaling
1835
 
1836
  self.q_proj = nn.Linear(
1837
- config.hidden_size,
1838
- config.num_attention_heads * self.head_dim,
1839
- bias=config.attention_bias,
1840
  )
1841
  self.k_proj = nn.Linear(
1842
- config.hidden_size,
1843
- config.num_key_value_heads * self.head_dim,
1844
- bias=config.attention_bias,
1845
  )
1846
  self.v_proj = nn.Linear(
1847
- config.hidden_size,
1848
- config.num_key_value_heads * self.head_dim,
1849
- bias=config.attention_bias,
1850
  )
1851
  self.o_proj = nn.Linear(
1852
- config.num_attention_heads * self.head_dim,
1853
- config.hidden_size,
1854
- bias=config.attention_bias,
1855
  )
1856
- self.q_norm = Qwen3RMSNorm(
1857
- self.head_dim, eps=config.rms_norm_eps
1858
- ) # unlike olmo, only on the head dim!
1859
- self.k_norm = Qwen3RMSNorm(
1860
- self.head_dim, eps=config.rms_norm_eps
1861
- ) # thus post q_norm does not need reshape
1862
 
1863
  self.rotary_emb = KeyeRotaryEmbedding(config=config)
1864
 
@@ -1871,18 +1507,12 @@ class KeyeAttention(nn.Module):
1871
  output_attentions: bool = False,
1872
  use_cache: bool = False,
1873
  cache_position: Optional[torch.LongTensor] = None,
1874
- position_embeddings: Optional[
1875
- Tuple[torch.Tensor, torch.Tensor]
1876
- ] = None, # necessary, but kept here for BC
1877
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1878
  bsz, q_len, _ = hidden_states.size()
1879
 
1880
- query_states = self.q_norm(
1881
- self.q_proj(hidden_states).view(bsz, q_len, -1, self.head_dim)
1882
- )
1883
- key_states = self.k_norm(
1884
- self.k_proj(hidden_states).view(bsz, q_len, -1, self.head_dim)
1885
- )
1886
  value_states = self.v_proj(hidden_states)
1887
 
1888
  query_states = query_states.transpose(1, 2)
@@ -1895,22 +1525,15 @@ class KeyeAttention(nn.Module):
1895
  )
1896
 
1897
  if past_key_value is not None:
1898
- cache_kwargs = {
1899
- "sin": sin,
1900
- "cos": cos,
1901
- "cache_position": cache_position,
1902
- } # Specific to RoPE models
1903
- key_states, value_states = past_key_value.update(
1904
- key_states, value_states, self.layer_idx, cache_kwargs
1905
- )
1906
 
1907
  # repeat k/v heads if n_kv_heads < n_heads
1908
  key_states = repeat_kv(key_states, self.num_key_value_groups)
1909
  value_states = repeat_kv(value_states, self.num_key_value_groups)
1910
 
1911
- attn_weights = torch.matmul(
1912
- query_states, key_states.transpose(2, 3)
1913
- ) / math.sqrt(self.head_dim)
1914
 
1915
  if attention_mask is not None: # no matter the length, we just slice it
1916
  causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
@@ -1919,17 +1542,11 @@ class KeyeAttention(nn.Module):
1919
  # Fix precision issues in float16 inference
1920
  # Replace inf values with zeros in attention weights to prevent NaN propagation
1921
  if query_states.dtype == torch.float16:
1922
- attn_weights = torch.where(
1923
- torch.isinf(attn_weights), torch.zeros_like(attn_weights), attn_weights
1924
- )
1925
 
1926
  # upcast attention to fp32
1927
- attn_weights = nn.functional.softmax(
1928
- attn_weights, dim=-1, dtype=torch.float32
1929
- ).to(query_states.dtype)
1930
- attn_weights = nn.functional.dropout(
1931
- attn_weights, p=self.attention_dropout, training=self.training
1932
- )
1933
  attn_output = torch.matmul(attn_weights, value_states)
1934
 
1935
  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
@@ -1975,19 +1592,15 @@ class KeyeFlashAttention2(KeyeAttention):
1975
  output_attentions: bool = False,
1976
  use_cache: bool = False,
1977
  cache_position: Optional[torch.LongTensor] = None,
1978
- position_embeddings: Optional[
1979
- Tuple[torch.Tensor, torch.Tensor]
1980
- ] = None, # necessary, but kept here for BC
1981
  cu_seqlens: Optional[torch.Tensor] = None,
1982
- sliding_window=-1,
1983
  **kwargs,
1984
  ):
1985
  bsz, q_len, _ = hidden_states.size()
1986
- q = self.q_proj(hidden_states).view(bsz, q_len, -1, self.head_dim)
1987
  query_states = self.q_norm(q)
1988
- key_states = self.k_norm(
1989
- self.k_proj(hidden_states).view(bsz, q_len, -1, self.head_dim)
1990
- )
1991
  value_states = self.v_proj(hidden_states)
1992
 
1993
  query_states = query_states.transpose(1, 2)
@@ -2001,20 +1614,14 @@ class KeyeFlashAttention2(KeyeAttention):
2001
  )
2002
 
2003
  if past_key_value is not None:
2004
- cache_kwargs = {
2005
- "sin": sin,
2006
- "cos": cos,
2007
- "cache_position": cache_position,
2008
- } # Specific to RoPE models
2009
- key_states, value_states = past_key_value.update(
2010
- key_states, value_states, self.layer_idx, cache_kwargs
2011
- )
2012
 
2013
  # repeat k/v heads if n_kv_heads < n_heads
2014
  key_states = repeat_kv(key_states, self.num_key_value_groups)
2015
  value_states = repeat_kv(value_states, self.num_key_value_groups)
2016
  dropout_rate = 0.0 if not self.training else self.attention_dropout
2017
-
2018
  # In PEFT, usually we cast the layer norms in float32 for training stability reasons
2019
  # therefore the input hidden states gets silently casted in float32. Hence, we need
2020
  # cast them back in float16 just to be sure everything works as expected.
@@ -2068,7 +1675,7 @@ class KeyeFlashAttention2(KeyeAttention):
2068
  max_seqlen,
2069
  dropout_p=dropout_rate,
2070
  window_size=(sliding_window, sliding_window),
2071
- causal=self.is_causal,
2072
  )
2073
  else:
2074
  attn_output = _flash_attention_forward(
@@ -2108,9 +1715,7 @@ class KeyeSdpaAttention(KeyeAttention):
2108
  output_attentions: bool = False,
2109
  use_cache: bool = False,
2110
  cache_position: Optional[torch.LongTensor] = None,
2111
- position_embeddings: Optional[
2112
- Tuple[torch.Tensor, torch.Tensor]
2113
- ] = None, # necessary, but kept here for BC
2114
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
2115
  if output_attentions:
2116
  # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
@@ -2131,12 +1736,8 @@ class KeyeSdpaAttention(KeyeAttention):
2131
 
2132
  bsz, q_len, _ = hidden_states.size()
2133
 
2134
- query_states = self.q_norm(
2135
- self.q_proj(hidden_states).view(bsz, q_len, -1, self.head_dim)
2136
- )
2137
- key_states = self.k_norm(
2138
- self.k_proj(hidden_states).view(bsz, q_len, -1, self.head_dim)
2139
- )
2140
  value_states = self.v_proj(hidden_states)
2141
 
2142
  query_states = query_states.transpose(1, 2)
@@ -2149,14 +1750,8 @@ class KeyeSdpaAttention(KeyeAttention):
2149
  )
2150
 
2151
  if past_key_value is not None:
2152
- cache_kwargs = {
2153
- "sin": sin,
2154
- "cos": cos,
2155
- "cache_position": cache_position,
2156
- } # Specific to RoPE models
2157
- key_states, value_states = past_key_value.update(
2158
- key_states, value_states, self.layer_idx, cache_kwargs
2159
- )
2160
 
2161
  key_states = repeat_kv(key_states, self.num_key_value_groups)
2162
  value_states = repeat_kv(value_states, self.num_key_value_groups)
@@ -2194,6 +1789,7 @@ class KeyeSdpaAttention(KeyeAttention):
2194
  return attn_output, None, past_key_value
2195
 
2196
 
 
2197
  QWEN3_ATTENTION_CLASSES = {
2198
  "eager": KeyeAttention,
2199
  "flash_attention_2": KeyeFlashAttention2,
@@ -2205,24 +1801,17 @@ class KeyeDecoderLayer(nn.Module):
2205
  def __init__(self, config: KeyeConfig, layer_idx: int):
2206
  super().__init__()
2207
  self.hidden_size = config.hidden_size
2208
-
2209
- if (
2210
- config.use_sliding_window
2211
- and config._attn_implementation != "flash_attention_2"
2212
- ):
2213
  logger.warning_once(
2214
  f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
2215
  "unexpected results may be encountered."
2216
  )
2217
 
2218
- self.self_attn = QWEN3_ATTENTION_CLASSES[config._attn_implementation](
2219
- config, layer_idx
2220
- )
2221
  self.mlp = Qwen3MLP(config)
2222
  self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
2223
- self.post_attention_layernorm = Qwen3RMSNorm(
2224
- config.hidden_size, eps=config.rms_norm_eps
2225
- )
2226
 
2227
  def forward(
2228
  self,
@@ -2233,13 +1822,9 @@ class KeyeDecoderLayer(nn.Module):
2233
  output_attentions: Optional[bool] = False,
2234
  use_cache: Optional[bool] = False,
2235
  cache_position: Optional[torch.LongTensor] = None,
2236
- position_embeddings: Optional[
2237
- Tuple[torch.Tensor, torch.Tensor]
2238
- ] = None, # necessary, but kept here for BC
2239
  **kwargs,
2240
- ) -> Tuple[
2241
- torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
2242
- ]:
2243
  """
2244
  Args:
2245
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -2275,7 +1860,7 @@ class KeyeDecoderLayer(nn.Module):
2275
  use_cache=use_cache,
2276
  cache_position=cache_position,
2277
  position_embeddings=position_embeddings,
2278
- **kwargs,
2279
  )
2280
 
2281
  hidden_states = residual + hidden_states
@@ -2291,6 +1876,7 @@ class KeyeDecoderLayer(nn.Module):
2291
  if output_attentions:
2292
  outputs += (self_attn_weights,)
2293
 
 
2294
  if use_cache:
2295
  outputs += (present_key_value,)
2296
 
@@ -2307,14 +1893,9 @@ class Qwen3Model(Qwen3PreTrainedModel):
2307
  self.padding_idx = config.pad_token_id
2308
  self.vocab_size = config.vocab_size
2309
 
2310
- self.embed_tokens = nn.Embedding(
2311
- config.vocab_size, config.hidden_size, self.padding_idx
2312
- )
2313
  self.layers = nn.ModuleList(
2314
- [
2315
- KeyeDecoderLayer(config, layer_idx)
2316
- for layer_idx in range(config.num_hidden_layers)
2317
- ]
2318
  )
2319
  self._attn_implementation = config._attn_implementation
2320
  self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -2342,28 +1923,18 @@ class Qwen3Model(Qwen3PreTrainedModel):
2342
  output_hidden_states: Optional[bool] = None,
2343
  return_dict: Optional[bool] = None,
2344
  cache_position: Optional[torch.LongTensor] = None,
2345
- **kwargs,
2346
  ) -> Union[Tuple, BaseModelOutputWithPast]:
2347
- output_attentions = (
2348
- output_attentions
2349
- if output_attentions is not None
2350
- else self.config.output_attentions
2351
- )
2352
  output_hidden_states = (
2353
- output_hidden_states
2354
- if output_hidden_states is not None
2355
- else self.config.output_hidden_states
2356
  )
2357
  use_cache = use_cache if use_cache is not None else self.config.use_cache
2358
 
2359
- return_dict = (
2360
- return_dict if return_dict is not None else self.config.use_return_dict
2361
- )
2362
 
2363
  if (input_ids is None) ^ (inputs_embeds is not None):
2364
- raise ValueError(
2365
- "You must specify exactly one of input_ids or inputs_embeds"
2366
- )
2367
 
2368
  if self.gradient_checkpointing and self.training:
2369
  if use_cache:
@@ -2380,29 +1951,19 @@ class Qwen3Model(Qwen3PreTrainedModel):
2380
  inputs_embeds = self.embed_tokens(input_ids)
2381
 
2382
  if cache_position is None:
2383
- past_seen_tokens = (
2384
- past_key_values.get_seq_length() if past_key_values is not None else 0
2385
- )
2386
  cache_position = torch.arange(
2387
- past_seen_tokens,
2388
- past_seen_tokens + inputs_embeds.shape[1],
2389
- device=inputs_embeds.device,
2390
  )
2391
 
2392
  # the hard coded `3` is for temporal, height and width.
2393
  if position_ids is None:
2394
- position_ids = cache_position.view(1, 1, -1).expand(
2395
- 3, inputs_embeds.shape[0], -1
2396
- )
2397
  elif position_ids.dim() == 2:
2398
  position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
2399
 
2400
  causal_mask = self._update_causal_mask(
2401
- attention_mask,
2402
- inputs_embeds,
2403
- cache_position,
2404
- past_key_values,
2405
- output_attentions,
2406
  )
2407
  hidden_states = inputs_embeds
2408
 
@@ -2462,11 +2023,7 @@ class Qwen3Model(Qwen3PreTrainedModel):
2462
  next_cache = next_decoder_cache if use_cache else None
2463
 
2464
  if not return_dict:
2465
- return tuple(
2466
- v
2467
- for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
2468
- if v is not None
2469
- )
2470
  return BaseModelOutputWithPast(
2471
  last_hidden_state=hidden_states,
2472
  past_key_values=next_cache,
@@ -2484,9 +2041,7 @@ class Qwen3Model(Qwen3PreTrainedModel):
2484
  ):
2485
  if self.config._attn_implementation == "flash_attention_2":
2486
  if attention_mask is not None and past_key_values is not None:
2487
- is_padding_right = (
2488
- attention_mask[:, -1].sum().item() != input_tensor.size()[0]
2489
- )
2490
  if is_padding_right:
2491
  raise ValueError(
2492
  "You are attempting to perform batched generation with padding_side='right'"
@@ -2500,9 +2055,7 @@ class Qwen3Model(Qwen3PreTrainedModel):
2500
  # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
2501
  # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
2502
  # to infer the attention mask.
2503
- past_seen_tokens = (
2504
- past_key_values.get_seq_length() if past_key_values is not None else 0
2505
- )
2506
  using_static_cache = isinstance(past_key_values, StaticCache)
2507
  using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
2508
 
@@ -2557,9 +2110,7 @@ class Qwen3Model(Qwen3PreTrainedModel):
2557
  # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
2558
  # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
2559
  # Details: https://github.com/pytorch/pytorch/issues/110213
2560
- causal_mask = AttentionMaskConverter._unmask_unattended(
2561
- causal_mask, min_dtype
2562
- )
2563
 
2564
  return causal_mask
2565
 
@@ -2605,41 +2156,31 @@ class Qwen3Model(Qwen3PreTrainedModel):
2605
  else:
2606
  min_dtype = torch.finfo(dtype).min
2607
  causal_mask = torch.full(
2608
- (sequence_length, target_length),
2609
- fill_value=min_dtype,
2610
- dtype=dtype,
2611
- device=device,
2612
  )
2613
- diagonal_attend_mask = torch.arange(
2614
- target_length, device=device
2615
- ) > cache_position.reshape(-1, 1)
2616
  if config.sliding_window is not None:
2617
  # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
2618
  # the check is needed to verify is current checkpoint was trained with sliding window or not
2619
- if (
2620
- not isinstance(past_key_values, SlidingWindowCache)
2621
- or sequence_length > target_length
2622
- ):
2623
- sliding_attend_mask = torch.arange(
2624
- target_length, device=device
2625
- ) <= (cache_position.reshape(-1, 1) - config.sliding_window)
2626
  diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
2627
  causal_mask *= diagonal_attend_mask
2628
  causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
2629
  if attention_mask is not None:
2630
- causal_mask = (
2631
- causal_mask.clone()
2632
- ) # copy to contiguous memory for in-place edit
2633
  if attention_mask.shape[-1] > target_length:
2634
  attention_mask = attention_mask[:, :target_length]
2635
  mask_length = attention_mask.shape[-1]
2636
- padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[
2637
- :, None, None, :
2638
- ].to(causal_mask.device)
2639
  padding_mask = padding_mask == 0
2640
- causal_mask[:, :, :, :mask_length] = causal_mask[
2641
- :, :, :, :mask_length
2642
- ].masked_fill(padding_mask, min_dtype)
2643
  return causal_mask
2644
 
2645
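The mask helpers touched in this hunk matter precisely for non-flash inference: eager and SDPA consume an explicit 4D additive mask, whereas flash-attention relies on its own causal flag. A compact sketch of the causal part of that construction (padding and sliding-window handling omitted; names are illustrative):

```python
# Sketch: the additive causal mask consumed by the eager/SDPA paths.
import torch

def causal_mask_4d(sequence_length, target_length, cache_position, dtype, device):
    min_dtype = torch.finfo(dtype).min
    mask = torch.full((sequence_length, target_length), min_dtype, dtype=dtype, device=device)
    # A query token may attend to every key position up to its own cache position.
    allowed = torch.arange(target_length, device=device) <= cache_position.reshape(-1, 1)
    mask = mask.masked_fill(allowed, 0.0)
    return mask[None, None, :, :]  # (1, 1, q_len, kv_len), broadcast over batch and heads
```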
 
@@ -2699,6 +2240,7 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
2699
  # Initialize weights and apply final processing
2700
  self.post_init()
2701
 
 
2702
  def get_input_embeddings(self):
2703
  return self.model.embed_tokens
2704
 
@@ -2783,9 +2325,7 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
2783
  video_token_id = self.config.video_token_id
2784
  vision_start_token_id = self.config.vision_start_token_id
2785
  mrope_position_deltas = []
2786
- if input_ids is not None and (
2787
- image_grid_thw is not None or video_grid_thw is not None
2788
- ):
2789
  total_input_ids = input_ids
2790
  if attention_mask is None:
2791
  attention_mask = torch.ones_like(total_input_ids)
@@ -2801,9 +2341,7 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
2801
  for i, input_ids in enumerate(total_input_ids):
2802
  input_ids = input_ids[attention_mask[i] == 1]
2803
  image_nums, video_nums = 0, 0
2804
- vision_start_indices = torch.argwhere(
2805
- input_ids == vision_start_token_id
2806
- ).squeeze(1)
2807
  vision_tokens = input_ids[vision_start_indices + 1]
2808
  image_nums = (vision_tokens == image_token_id).sum()
2809
  video_nums = (vision_tokens == video_token_id).sum()
@@ -2851,80 +2389,39 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
2851
  )
2852
  text_len = ed - st
2853
 
2854
- st_idx = (
2855
- llm_pos_ids_list[-1].max() + 1
2856
- if len(llm_pos_ids_list) > 0
2857
- else 0
2858
- )
2859
- llm_pos_ids_list.append(
2860
- torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
2861
- )
2862
 
2863
- if torch.is_tensor(second_per_grid_t):
2864
- second_per_grid_t = second_per_grid_t.detach().item()
2865
  range_tensor = torch.arange(llm_grid_t).view(-1, 1)
2866
  expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
2867
 
2868
- time_tensor = (
2869
- expanded_range
2870
- * second_per_grid_t
2871
- * self.config.vision_config.tokens_per_second
2872
- )
2873
 
2874
  time_tensor_long = time_tensor.long()
2875
  t_index = time_tensor_long.flatten()
2876
 
2877
- h_index = (
2878
- torch.arange(llm_grid_h)
2879
- .view(1, -1, 1)
2880
- .expand(llm_grid_t, -1, llm_grid_w)
2881
- .flatten()
2882
- )
2883
- w_index = (
2884
- torch.arange(llm_grid_w)
2885
- .view(1, 1, -1)
2886
- .expand(llm_grid_t, llm_grid_h, -1)
2887
- .flatten()
2888
- )
2889
- llm_pos_ids_list.append(
2890
- torch.stack([t_index, h_index, w_index]) + text_len + st_idx
2891
- )
2892
  st = ed + llm_grid_t * llm_grid_h * llm_grid_w
2893
 
2894
  if st < len(input_tokens):
2895
- st_idx = (
2896
- llm_pos_ids_list[-1].max() + 1
2897
- if len(llm_pos_ids_list) > 0
2898
- else 0
2899
- )
2900
  text_len = len(input_tokens) - st
2901
- llm_pos_ids_list.append(
2902
- torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
2903
- )
2904
 
2905
  llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
2906
- position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(
2907
- position_ids.device
2908
- )
2909
- mrope_position_deltas.append(
2910
- llm_positions.max() + 1 - len(total_input_ids[i])
2911
- )
2912
- mrope_position_deltas = torch.tensor(
2913
- mrope_position_deltas, device=input_ids.device
2914
- ).unsqueeze(1)
2915
  return position_ids, mrope_position_deltas
2916
  else:
2917
  if attention_mask is not None:
2918
  position_ids = attention_mask.long().cumsum(-1) - 1
2919
  position_ids.masked_fill_(attention_mask == 0, 1)
2920
- position_ids = (
2921
- position_ids.unsqueeze(0)
2922
- .expand(3, -1, -1)
2923
- .to(attention_mask.device)
2924
- )
2925
- max_position_ids = position_ids.max(0, keepdim=False)[0].max(
2926
- -1, keepdim=True
2927
- )[0]
2928
  mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
2929
  else:
2930
  position_ids = (
@@ -2940,9 +2437,7 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
2940
 
2941
  return position_ids, mrope_position_deltas
2942
 
2943
- @replace_return_docstrings(
2944
- output_type=KeyeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
2945
- )
2946
  def forward(
2947
  self,
2948
  input_ids: torch.LongTensor = None,
@@ -2962,7 +2457,7 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
2962
  rope_deltas: Optional[torch.LongTensor] = None,
2963
  cache_position: Optional[torch.LongTensor] = None,
2964
  second_per_grid_ts: Optional[torch.Tensor] = None,
2965
- **kwargs,
2966
  ) -> Union[Tuple, KeyeCausalLMOutputWithPast]:
2967
  r"""
2968
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -3003,19 +2498,11 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3003
  "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
3004
  ```"""
3005
 
3006
- output_attentions = (
3007
- output_attentions
3008
- if output_attentions is not None
3009
- else self.config.output_attentions
3010
- )
3011
  output_hidden_states = (
3012
- output_hidden_states
3013
- if output_hidden_states is not None
3014
- else self.config.output_hidden_states
3015
- )
3016
- return_dict = (
3017
- return_dict if return_dict is not None else self.config.use_return_dict
3018
  )
 
3019
 
3020
  if inputs_embeds is None:
3021
  inputs_embeds = self.model.embed_tokens(input_ids)
@@ -3034,21 +2521,15 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3034
  image_grid_hws.append(thw_tuple)
3035
  image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
3036
  siglip_position_ids.append(image_position_ids)
3037
- sample_indices.append(torch.full((numel,), idx, dtype=torch.int64))
3038
  cu_seqlens.append(cu_seqlens[-1] + numel)
3039
-
3040
- siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(
3041
- pixel_values.device
3042
- )
3043
- cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(
3044
- pixel_values.device
3045
- )
3046
- sample_indices = torch.concat(sample_indices, dim=0).to(
3047
- pixel_values.device
3048
- )
3049
 
3050
  vision_outputs = self.visual(
3051
- pixel_values=pixel_values,
3052
  image_grid_thw=image_grid_hws,
3053
  position_ids=siglip_position_ids,
3054
  vision_return_embed_list=True,
@@ -3057,29 +2538,27 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3057
  cu_seqlens=cu_seqlens,
3058
  return_pooler_output=False,
3059
  use_rope=True,
3060
- window_size=-1,
3061
  )
3062
  image_embeds = vision_outputs.last_hidden_state
3063
 
3064
  image_embeds = self.mlp_AR(image_embeds, image_grid_thw)
3065
-
3066
  n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
3067
- # image_embeds is a list of tensor, each tensor is a image feature,I want to concat them all into a tensor
3068
- image_embeds = torch.cat(image_embeds, dim=0)
3069
  n_image_features = image_embeds.shape[0]
3070
  if n_image_tokens != n_image_features:
3071
  raise ValueError(
3072
  f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
3073
  )
3074
 
3075
- mask = input_ids == self.config.image_token_id
3076
  mask_unsqueezed = mask.unsqueeze(-1)
3077
  mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
3078
  image_mask = mask_expanded.to(inputs_embeds.device)
3079
 
3080
- image_embeds = image_embeds.to(
3081
- inputs_embeds.device, inputs_embeds.dtype
3082
- )
3083
 
3084
  inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
3085
 
@@ -3098,20 +2577,14 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3098
  video_grid_hws.append(thw_tuple)
3099
  video_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
3100
  siglip_position_ids.append(video_position_ids)
3101
- sample_indices.append(torch.full((numel,), idx, dtype=torch.int64))
3102
  cu_seqlens.append(cu_seqlens[-1] + numel)
3103
- siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(
3104
- pixel_values_videos.device
3105
- )
3106
- cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(
3107
- pixel_values_videos.device
3108
- )
3109
- sample_indices = torch.concat(sample_indices, dim=0).to(
3110
- pixel_values_videos.device
3111
- )
3112
 
3113
  vision_outputs = self.visual(
3114
- pixel_values=pixel_values_videos,
3115
  image_grid_thw=video_grid_hws,
3116
  position_ids=siglip_position_ids,
3117
  vision_return_embed_list=True,
@@ -3120,12 +2593,12 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3120
  cu_seqlens=cu_seqlens,
3121
  return_pooler_output=False,
3122
  use_rope=True,
3123
- window_size=-1,
3124
  )
3125
  video_embeds = vision_outputs.last_hidden_state
3126
  video_embeds = self.mlp_AR(video_embeds, video_grid_thw)
3127
  n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
3128
- video_embeds = torch.cat(video_embeds, dim=0)
3129
  n_video_features = video_embeds.shape[0]
3130
  if n_video_tokens != n_video_features:
3131
  raise ValueError(
@@ -3137,18 +2610,14 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3137
  mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
3138
  video_mask = mask_expanded.to(inputs_embeds.device)
3139
 
3140
- video_embeds = video_embeds.to(
3141
- inputs_embeds.device, inputs_embeds.dtype
3142
- )
3143
  inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
3144
 
3145
  if attention_mask is not None:
3146
  attention_mask = attention_mask.to(inputs_embeds.device)
3147
 
3148
  # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
3149
- if position_ids is None and (
3150
- attention_mask is None or attention_mask.ndim == 2
3151
- ):
3152
  # calculate RoPE index once per generation in the pre-fill stage only
3153
  if (
3154
  (cache_position is not None and cache_position[0] == 0)
@@ -3189,7 +2658,7 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3189
  output_hidden_states=output_hidden_states,
3190
  return_dict=return_dict,
3191
  cache_position=cache_position,
3192
- **kwargs,
3193
  )
3194
 
3195
  hidden_states = outputs[0]
@@ -3309,13 +2778,7 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3309
  if expand_size == 1:
3310
  return input_ids, model_kwargs
3311
 
3312
- visual_keys = [
3313
- "pixel_values",
3314
- "image_grid_thw",
3315
- "pixel_values_videos",
3316
- "video_grid_thw",
3317
- "second_per_grid_ts",
3318
- ]
3319
 
3320
  def _expand_dict_for_generation_visual(dict_to_expand):
3321
  image_grid_thw = model_kwargs.get("image_grid_thw", None)
@@ -3325,9 +2788,7 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3325
  def _repeat_interleave_samples(x, lengths, repeat_times):
3326
  samples = torch.split(x, lengths)
3327
  repeat_args = [repeat_times] + [1] * (x.dim() - 1)
3328
- result = torch.cat(
3329
- [sample.repeat(*repeat_args) for sample in samples], dim=0
3330
- )
3331
  return result
3332
 
3333
  for key in dict_to_expand:
@@ -3363,9 +2824,7 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3363
  )
3364
  tensor = torch.tensor(dict_to_expand[key])
3365
  lengths = list(video_nums)
3366
- tensor = _repeat_interleave_samples(
3367
- tensor, lengths=lengths, repeat_times=expand_size
3368
- )
3369
  dict_to_expand[key] = tensor.tolist()
3370
  return dict_to_expand
3371
 
@@ -3377,9 +2836,7 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3377
  and isinstance(dict_to_expand[key], torch.Tensor)
3378
  and key not in visual_keys
3379
  ):
3380
- dict_to_expand[key] = dict_to_expand[key].repeat_interleave(
3381
- expand_size, dim=0
3382
- )
3383
  return dict_to_expand
3384
 
3385
  # input_ids is required for expanding visual inputs
@@ -3394,11 +2851,15 @@ class KeyeForConditionalGeneration(Qwen3PreTrainedModel, GenerationMixin):
3394
 
3395
  if is_encoder_decoder:
3396
  if model_kwargs.get("encoder_outputs") is None:
3397
- raise ValueError(
3398
- "If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined."
3399
- )
3400
- model_kwargs["encoder_outputs"] = _expand_dict_for_generation(
3401
- model_kwargs["encoder_outputs"]
3402
- )
3403
 
3404
  return input_ids, model_kwargs
 
31
  from torch.nn import CrossEntropyLoss
32
 
33
  from transformers.activations import ACT2FN
34
+ from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
35
  from transformers.generation import GenerationMixin
36
  from transformers.modeling_attn_mask_utils import AttentionMaskConverter
37
+ from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutput, BaseModelOutputWithPooling
38
  from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
39
  from transformers.modeling_utils import PreTrainedModel, sdpa_attention_forward
40
  from transformers.activations import GELUActivation, ACT2FN, PytorchGELUTanh
 
46
  logging,
47
  replace_return_docstrings,
48
  torch_int,
49
+ is_flash_attn_greater_or_equal_2_10
50
  )
51
  from .configuration_keye import KeyeConfig, KeyeVisionConfig
52
 
 
55
  from typing import Any, Callable, Optional, Tuple, Union, List
56
  from torch import nn
57
  from torch.nn.init import _calculate_fan_in_and_fan_out
58
+ from einops import repeat
59
 
60
 
 
61
  if is_flash_attn_2_available():
62
  from flash_attn import flash_attn_varlen_func
63
  from flash_attn.layers.rotary import apply_rotary_emb
 
71
 
72
  _CONFIG_FOR_DOC = "KeyeConfig"
73
 
 
74
  class KeyeMLP(nn.Module):
75
  def __init__(self, config, bias: bool = False):
76
  super().__init__()
 
82
  self.act_fn = ACT2FN[config.hidden_act]
83
 
84
  def forward(self, hidden_state):
85
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
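A minimal standalone sketch (hypothetical sizes, assuming config.hidden_act resolves to SiLU) of the gated MLP computed by the one-line forward above:

import torch
import torch.nn as nn
import torch.nn.functional as F

hidden_size, intermediate_size = 8, 16          # hypothetical sizes
gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

x = torch.randn(2, 4, hidden_size)              # (batch, seq, hidden)
out = down_proj(F.silu(gate_proj(x)) * up_proj(x))
assert out.shape == x.shape                     # the MLP preserves the hidden size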
 
 
86
 
87
 
88
  def _trunc_normal_(tensor, mean, std, a, b):
 
122
 
123
 
124
  def trunc_normal_tf_(
125
+ tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
126
  ) -> torch.Tensor:
127
  """Fills the input Tensor with values drawn from a truncated
128
  normal distribution. The values are effectively drawn from the
 
180
  variance_scaling_(tensor, mode="fan_in", distribution="normal")
181
 
182
 
183
  class Projector(nn.Module):
184
 
185
+ def __init__(self, text_config: KeyeConfig, vision_config: KeyeVisionConfig):
186
  super().__init__()
187
  self.text_config = text_config
188
  self.vision_config = vision_config
 
201
  self.hidden_size, self.text_config.hidden_size, bias=True
202
  )
203
 
204
+ def forward(self, image_features: torch.Tensor, image_grid_thw: List[Tuple[int, int, int]]) -> torch.Tensor:
 
 
205
  m1, m2 = self.merge_kernel_size
206
  if isinstance(image_features, (list, tuple)):
207
  processed_features = list()
 
210
  t, h, w = image_grid
211
  from einops import rearrange
212
 
213
+ image_feature = rearrange(image_feature, "(t h p1 w p2) d -> (t h w) (p1 p2 d)", t=t, h=h // m1, p1=m1, w=w // m2, p2=m2)
214
  hidden_states = self.linear_1(image_feature)
215
  hidden_states = self.act(hidden_states)
216
  hidden_states = self.linear_2(hidden_states)
 
228
 
229
  return hidden_states.view(*dims, -1)
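A standalone sketch (hypothetical grid and merge kernel) of the einops regrouping in Projector.forward above: m1 x m2 neighbouring patches are folded into one token before the two linear layers.

import torch
from einops import rearrange

t, h, w, d = 1, 4, 6, 8                          # hypothetical grid and feature dim
m1, m2 = 2, 2                                    # merge kernel
feats = torch.randn(t * h * w, d)                # flattened (t*h*w, d) patch features
merged = rearrange(
    feats, "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
    t=t, h=h // m1, p1=m1, w=w // m2, p2=m2,
)
# each output token now carries an m1 x m2 block of patch features
assert merged.shape == (t * (h // m1) * (w // m2), m1 * m2 * d)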
230
 
 
231
  class SiglipVisionEmbeddings(nn.Module):
232
  def __init__(self, config: KeyeVisionConfig):
233
  super().__init__()
 
251
  self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
252
  self.packing_position_embedding = nn.Embedding(32768, self.embed_dim)
253
 
254
+ self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
255
 
256
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int, is_after_patchify: bool = False) -> torch.Tensor:
257
  """
258
  This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
259
  images. This method is also adapted to support torch.jit tracing and no class embeddings.
 
276
  new_width = width // self.patch_size
277
 
278
  sqrt_num_positions = torch_int(num_positions**0.5)
279
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
 
 
280
  patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
281
 
282
  patch_pos_embed = nn.functional.interpolate(
 
304
  if grid in self.cache_position_embedding:
305
  self.cache_position_count[grid] += 1
306
  return self.cache_position_embedding[grid]
307
+
308
  if len(self.cache_position_embedding) >= max_cache:
309
+ min_hit_grid = min(self.cache_position_count, key=self.cache_position_count.get)
 
 
310
  self.cache_position_count.pop(min_hit_grid)
311
  self.cache_position_embedding.pop(min_hit_grid)
312
+
313
  position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True)
314
  self.cache_position_count[grid] = 1
315
  self.cache_position_embedding[grid] = position_embedding
316
  return position_embedding
317
 
318
  def forward(
319
+ self,
320
+ pixel_values: torch.FloatTensor,
321
  position_ids: Optional[torch.Tensor] = None,
322
+ image_grid_thw: Optional[List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]] = None,
323
+ interpolate_pos_encoding=False
 
 
324
  ) -> torch.Tensor:
325
  if pixel_values.dim() == 5:
326
  assert position_ids is not None
327
  from einops import rearrange
 
328
  batch_size, squence_len, channel, height, width = pixel_values.shape
329
  target_dtype = self.patch_embedding.weight.dtype
330
  pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w")
331
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
 
 
332
  embeddings = patch_embeds.flatten(-2).squeeze(-1)
333
+ embeddings = rearrange(embeddings, "(b l) d -> b l d", b=batch_size, l=squence_len)
 
 
334
 
335
  # todo: not yet debugged
336
  if interpolate_pos_encoding and image_grid_thw is not None:
 
338
  assert batch_size == 1
339
  start = 0
340
  image_embedding_list = list()
341
+ assert sum([np.prod(x) for x in flatten_image_grid_thw]) == embeddings.shape[1], (flatten_image_grid_thw, embeddings.shape)
342
  embeddings = embeddings.squeeze(0)
343
  tmp_embeddings = list()
344
  for image_grid in image_grid_thw:
345
  t, h, w = image_grid
346
  end = start + t * h * w
347
+ image_embeddings = embeddings[start: end, :]
348
+ position_embedding = self.interpolate_pos_encoding(image_embeddings, h, w, True).squeeze(0).repeat(
349
+ t, 1)
350
  image_embeddings = image_embeddings + position_embedding
351
  tmp_embeddings.append(image_embeddings)
352
  start = end
 
372
  if attention_mask is not None:
373
  attn_weights = attn_weights + attention_mask
374
 
375
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
376
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
377
 
378
  attn_output = torch.matmul(attn_weights, value)
379
  attn_output = attn_output.transpose(1, 2).contiguous()
 
414
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
415
  """Input shape: Batch x Time x Channel"""
416
 
417
+ use_flash_attn = (cu_seqlens is not None) and self.config._attn_implementation == "flash_attention_2"
 
 
418
 
419
  batch_size, seq_length, embed_dim = hidden_states.shape
420
 
 
423
  values = self.v_proj(hidden_states)
424
 
425
  if rope_emb is None:
426
+ queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
427
+ keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
428
+ values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
429
  else:
430
  assert cu_seqlens is not None, "Rope support flash attn only."
431
  cos, sin = rope_emb
432
+ queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim)
 
 
433
  keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim)
434
+ if use_flash_attn:
435
+ queries, keys = apply_rotary_pos_emb_flashatt(queries, keys, cos, sin)
436
+ else:
437
+ queries, keys = apply_rotary_pos_emb_vision(queries, keys, cos, sin)
438
  queries = queries.transpose(1, 2)
439
  keys = keys.transpose(1, 2)
440
+ values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
 
 
441
 
442
  if not use_flash_attn:
443
  attention_interface: Callable = eager_attention_forward
 
460
  scaling=self.scale,
461
  dropout=0.0 if not self.training else self.dropout,
462
  )
463
+ attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
 
 
464
  else:
465
  assert batch_size == 1, hidden_states.shape
466
  queries = queries.transpose(1, 2).squeeze(0)
467
  keys = keys.transpose(1, 2).squeeze(0)
468
  values = values.transpose(1, 2).squeeze(0)
469
 
 
 
470
  max_seqlen_q = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
471
  max_seqlen_k = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
472
+ assert cu_seqlens[-1].item() == queries.shape[0] == keys.shape[0] == values.shape[0], (cu_seqlens, queries.shape, keys.shape, values.shape)
473
 
474
  attn_output = flash_attn_varlen_func(
475
  queries,
 
735
  embed_dim = config.hidden_size
736
  num_heads = config.num_attention_heads
737
  head_dim = embed_dim // num_heads
738
+ self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
 
 
739
  self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
740
  self.gradient_checkpointing = False
741
 
 
751
 
752
  def build_window_index(self, image_grid, window_size, device):
753
  from einops import rearrange
 
754
  window_indices = list()
755
  pad_values = -100
756
  start_window_index = 0
 
762
  pad_w = (-w) % window_size
763
  assert pad_h >= 0 and pad_w >= 0, (pad_h, pad_w)
764
  window_index = F.pad(window_index, (0, pad_w, 0, pad_h), value=pad_values)
765
+ window_index = rearrange(window_index, "t (h p1) (w p2) -> t (h w) (p1 p2)", p1=window_size, p2=window_size)
766
  window_seqlens = (window_index != pad_values).long().sum(-1).reshape(-1)
767
  window_index = window_index.reshape(-1)
768
  window_index = window_index[window_index != pad_values]
769
  window_indices.append(window_index + start_window_index)
770
+ cu_seqlens_within_windows.append(window_seqlens.cumsum(0) + start_window_index)
 
 
771
  start_window_index += t * h * w
772
  window_indices = torch.concat(window_indices, dim=0)
773
  cu_seqlens_within_windows = torch.concat(cu_seqlens_within_windows, dim=0)
774
+ cu_seqlens_within_windows = F.pad(cu_seqlens_within_windows, (1, 0), value=0).to(torch.int32)
 
 
775
  return window_indices, cu_seqlens_within_windows
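A standalone sketch (hypothetical sizes, single image, not the model's exact helper) of the window partitioning idea used by build_window_index above: pad the patch grid to a multiple of the window size, regroup token indices window by window, and keep per-window sequence lengths for varlen attention.

import torch
import torch.nn.functional as F
from einops import rearrange

h, w, window_size = 5, 7, 4
index = torch.arange(h * w).reshape(1, h, w)                  # t=1 grid of token indices
pad_h, pad_w = (-h) % window_size, (-w) % window_size
index = F.pad(index, (0, pad_w, 0, pad_h), value=-100)        # pad to a multiple of the window
index = rearrange(index, "t (h p1) (w p2) -> t (h w) (p1 p2)", p1=window_size, p2=window_size)
seqlens = (index != -100).sum(-1).reshape(-1)                 # real tokens per window
flat = index.reshape(-1)
flat = flat[flat != -100]                                     # window-ordered token indices
cu_seqlens = F.pad(seqlens.cumsum(0), (1, 0), value=0).to(torch.int32)
assert flat.numel() == h * w and cu_seqlens[-1].item() == h * w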
776
 
777
  # Ignore copy
 
783
  output_attentions: Optional[bool] = None,
784
  output_hidden_states: Optional[bool] = None,
785
  cu_seqlens: Optional[List[torch.Tensor]] = None,
786
+ image_grid_thw: Optional[List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]] = None,
 
 
787
  height_position_ids: Optional[torch.Tensor] = None,
788
  width_position_ids: Optional[torch.Tensor] = None,
789
  use_rope: Optional[bool] = False,
 
816
 
817
  vision_or_text = "vision"
818
  assert vision_or_text in ["vision", "text"]
819
+ use_window_attn = (window_size > 0 and vision_or_text == "vision")
820
  use_rope = (use_rope is True) and (vision_or_text == "vision")
821
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
822
  output_hidden_states = (
823
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
824
  )
825
 
826
  encoder_states = () if output_hidden_states else None
 
828
 
829
  device = inputs_embeds.device
830
  hidden_states = inputs_embeds
831
+ attention_mask = attention_mask.to(inputs_embeds.dtype) if attention_mask is not None else None
832
  if use_rope is True:
833
  flatten_image_grid_thw = self.flatten_list(image_grid_thw)
834
+ assert sum([np.prod(x) for x in flatten_image_grid_thw]) == hidden_states.shape[1], (flatten_image_grid_thw, hidden_states.shape)
835
 
836
  if width_position_ids is None or height_position_ids is None:
837
  split_hids = list()
 
844
  split_wids.append(sample_wids)
845
  width_position_ids = torch.concat(split_wids, dim=0)
846
  height_position_ids = torch.concat(split_hids, dim=0)
847
+
848
  window_indices, cu_seqlens_within_windows = None, None
849
 
850
  if use_window_attn:
851
+ window_indices, cu_seqlens_within_windows = self.build_window_index(flatten_image_grid_thw, window_size, device)
 
 
852
  reversed_window_indices = window_indices.argsort()
853
  height_position_ids = height_position_ids[window_indices]
854
  width_position_ids = width_position_ids[window_indices]
 
863
 
864
  rope_emb = None
865
  window_indices, cu_seqlens_within_windows = None, None
866
+
867
  if use_window_attn:
868
  flatten_image_grid_thw = self.flatten_list(image_grid_thw)
869
+ assert sum([np.prod(x) for x in flatten_image_grid_thw]) == hidden_states.shape[1], (flatten_image_grid_thw, hidden_states.shape)
870
+
871
+ window_indices, cu_seqlens_within_windows = self.build_window_index(flatten_image_grid_thw, window_size, device)
872
  reversed_window_indices = window_indices.argsort()
873
 
874
  if use_window_attn:
 
880
 
881
  for encoder_layer in self.layers:
882
  if output_hidden_states:
883
+ encoder_states = encoder_states + ((hidden_states[:, reversed_window_indices, :],) if use_window_attn else (hidden_states, ))
884
  if self.gradient_checkpointing and self.training:
885
  layer_outputs = self._gradient_checkpointing_func(
886
  encoder_layer.__call__,
 
926
  self.embeddings = SiglipVisionEmbeddings(config)
927
  self.encoder = SiglipEncoder(config)
928
  self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
929
+ self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
 
 
930
  if self.use_head:
931
  self.head = SiglipMultiheadAttentionPoolingHead(config)
932
 
933
  # @can_return_tuple
934
  @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
935
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=KeyeVisionConfig)
 
 
936
  def forward(
937
  self,
938
  pixel_values,
 
948
  cu_seqlens: Optional[List[torch.Tensor]] = None,
949
  padding_mask: Optional[torch.Tensor] = None,
950
  vision_return_embed_list: Optional[bool] = False,
951
+ image_grid_thw: Optional[List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]] = None,
 
 
952
  return_pooler_output: Optional[bool] = True,
953
  use_rope: Optional[bool] = False,
954
  window_size: Optional[bool] = -1,
 
957
  Returns:
958
 
959
  """
960
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
961
  output_hidden_states = (
962
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
963
  )
964
  hidden_states = self.embeddings(
965
+ pixel_values,
966
+ interpolate_pos_encoding=interpolate_pos_encoding,
967
  position_ids=position_ids,
968
+ image_grid_thw=image_grid_thw
969
  )
970
 
971
  encoder_outputs: BaseModelOutput = self.encoder(
 
1001
  token_indices = (sample_index == sample_idx).nonzero().flatten()
1002
  sample_hidden_state = hidden_state[token_indices]
1003
  sample_hidden_state_list.append(sample_hidden_state)
1004
+
1005
  if not vision_return_embed_list:
1006
+ max_length = max([_state.shape[0] for _state in sample_hidden_state_list])
 
 
1007
  tmp_sample_hidden_state_list = list()
1008
  padding_mask = list()
1009
  for idx, _state in enumerate(sample_hidden_state_list):
1010
  padding_length = max_length - _state.shape[0]
1011
+ mask = _state.new_zeros(size=(max_length,), dtype=torch.int64)
1012
+ if padding_length > 0: mask[-padding_length:] = 1  # guard: mask[-0:] would flag the whole (unpadded) sample as padding
1013
  padding_mask.append(mask)
1014
  padding = _state.new_zeros(size=(padding_length, dim))
1015
  new_state = torch.concat([_state, padding], dim=0)
1016
  tmp_sample_hidden_state_list.append(new_state)
1017
+ sample_hidden_state = torch.stack(tmp_sample_hidden_state_list, dim=0)
1018
+ padding_mask = torch.stack(padding_mask, dim=0).float().to(last_hidden_state.dtype)
1019
+ pooler_output = self.head(sample_hidden_state, key_padding_mask=padding_mask)
1020
  else:
1021
  pooler_output = list()
1022
  for state in sample_hidden_state_list:
 
1040
  hidden_states=encoder_outputs.hidden_states,
1041
  attentions=encoder_outputs.attentions,
1042
  )
1043
+
1044
  sample_hidden_state = list()
1045
  assert cu_seqlens is not None
1046
  for i in range(cu_seqlens.shape[0] - 1):
1047
  start = cu_seqlens[i]
1048
  end = cu_seqlens[i + 1]
1049
+ tensor = last_hidden_state[:, start: end, :].squeeze(0)
1050
  sample_hidden_state.append(tensor)
1051
+
1052
  return BaseModelOutputWithPooling(
1053
  last_hidden_state=sample_hidden_state,
1054
  pooler_output=None,
 
1064
  super().__init__()
1065
 
1066
  self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
1067
+ self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
 
 
1068
  self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1069
  self.mlp = SiglipMLP(config)
1070
 
 
1072
  batch_size = hidden_state.shape[0]
1073
  probe = self.probe.repeat(batch_size, 1, 1)
1074
 
1075
+ hidden_state = self.attention(probe, hidden_state, hidden_state, key_padding_mask=key_padding_mask)[0]
 
 
1076
 
1077
  residual = hidden_state
1078
  hidden_state = self.layernorm(hidden_state)
 
1102
 
1103
  # @can_return_tuple
1104
  @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1105
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=KeyeVisionConfig)
 
 
1106
  def forward(
1107
  self,
1108
  pixel_values,
 
1112
  interpolate_pos_encoding: bool = False,
1113
  position_ids: Optional[torch.Tensor] = None,
1114
  vision_return_embed_list: Optional[bool] = False,
1115
+ image_grid_thw: Optional[List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]] = None,
 
 
1116
  cu_seqlens: Optional[List[torch.Tensor]] = None,
1117
  return_pooler_output: Optional[bool] = True,
1118
  use_rope: Optional[bool] = False,
 
1157
  )
1158
 
1159
 
1160
+
1161
  class Qwen3RMSNorm(nn.Module):
1162
  def __init__(self, hidden_size, eps=1e-6):
1163
  """
 
1204
  return q_embed, k_embed
1205
 
1206
 
1207
+
1208
  def rotate_half(x):
1209
  """Rotates half the hidden dims of the input."""
1210
  x1 = x[..., : x.shape[-1] // 2]
 
1225
  k_embed = k_embed.to(orig_k_dtype)
1226
  return q_embed, k_embed
1227
 
1228
  Keye_START_DOCSTRING = r"""
1229
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1230
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
 
1250
  config_class = KeyeConfig
1251
  base_model_prefix = "model"
1252
  supports_gradient_checkpointing = True
1253
+ _no_split_modules = ["KeyeDecoderLayer"]
1254
  _skip_keys_device_placement = "past_key_values"
1255
  _supports_flash_attn_2 = True
1256
  _supports_sdpa = True
 
1269
  module.weight.data[module.padding_idx].zero_()
1270
 
1271
 
1272
+
1273
  class SigLIPRotaryEmbedding(nn.Module):
1274
  def __init__(self, dim: int, theta: float = 10000.0) -> None:
1275
  super().__init__()
 
1278
  self.rope_init()
1279
 
1280
  def rope_init(self):
1281
+ inv_freq = 1.0 / (self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim))
 
 
1282
  self.register_buffer("inv_freq", inv_freq, persistent=False)
1283
 
1284
  def forward(self, seqlen: int) -> torch.Tensor:
1285
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
 
 
1286
  freqs = torch.outer(seq, self.inv_freq)
1287
  return freqs
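A standalone sketch (hypothetical dim and sequence length) of the rotary-frequency table this module produces: pairwise inverse frequencies crossed with positions, later consumed as cos/sin.

import torch

dim, theta, seqlen = 8, 10000.0, 5
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
seq = torch.arange(seqlen, dtype=inv_freq.dtype)
freqs = torch.outer(seq, inv_freq)        # (seqlen, dim // 2) rotation angles per position
cos, sin = freqs.cos(), freqs.sin()
assert freqs.shape == (seqlen, dim // 2)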
1288
 
 
1309
  else:
1310
  # BC: "rope_type" was originally "type"
1311
  if config.rope_scaling is not None:
1312
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
 
 
1313
  else:
1314
  self.rope_type = "default"
1315
  self.max_seq_len_cached = config.max_position_embeddings
1316
  self.original_max_seq_len = config.max_position_embeddings
1317
+
1318
  # BC: "rope_type" was originally "type"
1319
  if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
1320
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
 
 
1321
  else:
1322
  self.rope_type = "default"
1323
  self.max_seq_len_cached = config.max_position_embeddings
 
1341
  inv_freq, self.attention_scaling = self.rope_init_fn(
1342
  self.config, device, seq_len=seq_len, **self.rope_kwargs
1343
  )
1344
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
 
 
1345
  self.max_seq_len_cached = seq_len
1346
 
1347
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
1348
  self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
1349
  self.max_seq_len_cached = self.original_max_seq_len
1350
 
 
1355
 
1356
  # Core RoPE block. In contrast to other models, Keye has different position ids for the grids
1357
  # So we expand the inv_freq to shape (3, ...)
1358
+ inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
1359
+ position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
1360
  # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
1361
  device_type = x.device.type
1362
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
1363
  with torch.autocast(device_type=device_type, enabled=False):
1364
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
 
 
1365
  emb = torch.cat((freqs, freqs), dim=-1)
1366
  cos = emb.cos()
1367
  sin = emb.sin()
 
1431
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
1432
  """
1433
  mrope_section = mrope_section * 2
1434
+ cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
1435
+ unsqueeze_dim
1436
+ )
1437
+ sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
1438
+ unsqueeze_dim
1439
+ )
1440
 
1441
  q_embed = (q * cos) + (rotate_half(q) * sin)
1442
  k_embed = (k * cos) + (rotate_half(k) * sin)
 
1451
  batch, num_key_value_heads, slen, head_dim = hidden_states.shape
1452
  if n_rep == 1:
1453
  return hidden_states
1454
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
 
 
1455
  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
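A standalone sketch (hypothetical shapes) of the grouped-query-attention expansion repeat_kv performs above; it is equivalent to torch.repeat_interleave along the head dimension.

import torch

batch, num_kv_heads, n_rep, slen, head_dim = 2, 4, 3, 5, 8
kv = torch.randn(batch, num_kv_heads, slen, head_dim)
expanded = kv[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
expanded = expanded.reshape(batch, num_kv_heads * n_rep, slen, head_dim)
# every KV head is repeated n_rep times so it can serve n_rep query heads
assert torch.equal(expanded, torch.repeat_interleave(kv, n_rep, dim=1))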
1456
 
1457
 
 
1474
 
1475
  self.hidden_size = config.hidden_size
1476
  self.num_heads = config.num_attention_heads
1477
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
 
 
1478
  self.num_key_value_heads = config.num_key_value_heads
1479
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
 
 
1480
  self.is_causal = True
1481
  self.attention_dropout = config.attention_dropout
1482
  self.rope_scaling = config.rope_scaling
1483
 
1484
  self.q_proj = nn.Linear(
1485
+ config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
 
 
1486
  )
1487
  self.k_proj = nn.Linear(
1488
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
 
 
1489
  )
1490
  self.v_proj = nn.Linear(
1491
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
 
 
1492
  )
1493
  self.o_proj = nn.Linear(
1494
+ config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
 
 
1495
  )
1496
+ self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # unlike olmo, only on the head dim!
1497
+ self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # thus post q_norm does not need reshape
1498
 
1499
  self.rotary_emb = KeyeRotaryEmbedding(config=config)
1500
 
 
1507
  output_attentions: bool = False,
1508
  use_cache: bool = False,
1509
  cache_position: Optional[torch.LongTensor] = None,
1510
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
 
 
1511
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1512
  bsz, q_len, _ = hidden_states.size()
1513
 
1514
+ query_states = self.q_norm(self.q_proj(hidden_states).view(bsz, q_len, -1, self.head_dim))
1515
+ key_states = self.k_norm(self.k_proj(hidden_states).view(bsz, q_len, -1, self.head_dim))
1516
  value_states = self.v_proj(hidden_states)
1517
 
1518
  query_states = query_states.transpose(1, 2)
 
1525
  )
1526
 
1527
  if past_key_value is not None:
1528
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
1529
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1530
 
1531
  # repeat k/v heads if n_kv_heads < n_heads
1532
  key_states = repeat_kv(key_states, self.num_key_value_groups)
1533
  value_states = repeat_kv(value_states, self.num_key_value_groups)
1534
 
1535
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
1536
+
 
1537
 
1538
  if attention_mask is not None: # no matter the length, we just slice it
1539
  causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
 
1542
  # Fix precision issues in float16 inference
1543
  # Replace inf values with zeros in attention weights to prevent NaN propagation
1544
  if query_states.dtype == torch.float16:
1545
+ attn_weights = torch.where(torch.isinf(attn_weights), torch.zeros_like(attn_weights), attn_weights)
 
 
1546
 
1547
  # upcast attention to fp32
1548
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
1549
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
1550
  attn_output = torch.matmul(attn_weights, value_states)
1551
 
1552
  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
 
1592
  output_attentions: bool = False,
1593
  use_cache: bool = False,
1594
  cache_position: Optional[torch.LongTensor] = None,
1595
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
 
 
1596
  cu_seqlens: Optional[torch.Tensor] = None,
1597
+ sliding_window=-1,
1598
  **kwargs,
1599
  ):
1600
  bsz, q_len, _ = hidden_states.size()
1601
+ q = self.q_proj(hidden_states).view(bsz, q_len, -1, self.head_dim)
1602
  query_states = self.q_norm(q)
1603
+ key_states = self.k_norm(self.k_proj(hidden_states).view(bsz, q_len, -1, self.head_dim))
 
 
1604
  value_states = self.v_proj(hidden_states)
1605
 
1606
  query_states = query_states.transpose(1, 2)
 
1614
  )
1615
 
1616
  if past_key_value is not None:
1617
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
1618
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1619
 
1620
  # repeat k/v heads if n_kv_heads < n_heads
1621
  key_states = repeat_kv(key_states, self.num_key_value_groups)
1622
  value_states = repeat_kv(value_states, self.num_key_value_groups)
1623
  dropout_rate = 0.0 if not self.training else self.attention_dropout
1624
+
1625
  # In PEFT, usually we cast the layer norms in float32 for training stability reasons
1626
  # therefore the input hidden states gets silently casted in float32. Hence, we need
1627
  # cast them back in float16 just to be sure everything works as expected.
 
1675
  max_seqlen,
1676
  dropout_p=dropout_rate,
1677
  window_size=(sliding_window, sliding_window),
1678
+ causal=self.is_causal
1679
  )
1680
  else:
1681
  attn_output = _flash_attention_forward(
 
1715
  output_attentions: bool = False,
1716
  use_cache: bool = False,
1717
  cache_position: Optional[torch.LongTensor] = None,
1718
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
 
 
1719
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1720
  if output_attentions:
1721
  # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
 
1736
 
1737
  bsz, q_len, _ = hidden_states.size()
1738
 
1739
+ query_states = self.q_norm(self.q_proj(hidden_states).view(bsz, q_len, -1, self.head_dim))
1740
+ key_states = self.k_norm(self.k_proj(hidden_states).view(bsz, q_len, -1, self.head_dim))
1741
  value_states = self.v_proj(hidden_states)
1742
 
1743
  query_states = query_states.transpose(1, 2)
 
1750
  )
1751
 
1752
  if past_key_value is not None:
1753
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
1754
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1755
 
1756
  key_states = repeat_kv(key_states, self.num_key_value_groups)
1757
  value_states = repeat_kv(value_states, self.num_key_value_groups)
 
1789
  return attn_output, None, past_key_value
1790
 
1791
 
1792
+
1793
  QWEN3_ATTENTION_CLASSES = {
1794
  "eager": KeyeAttention,
1795
  "flash_attention_2": KeyeFlashAttention2,
 
1801
  def __init__(self, config: KeyeConfig, layer_idx: int):
1802
  super().__init__()
1803
  self.hidden_size = config.hidden_size
1804
+
1805
+ if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
1806
  logger.warning_once(
1807
  f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
1808
  "unexpected results may be encountered."
1809
  )
1810
 
1811
+ self.self_attn = QWEN3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
 
 
1812
  self.mlp = Qwen3MLP(config)
1813
  self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1814
+ self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
 
1815
 
1816
  def forward(
1817
  self,
 
1822
  output_attentions: Optional[bool] = False,
1823
  use_cache: Optional[bool] = False,
1824
  cache_position: Optional[torch.LongTensor] = None,
1825
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
 
 
1826
  **kwargs,
1827
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
 
 
1828
  """
1829
  Args:
1830
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
 
1860
  use_cache=use_cache,
1861
  cache_position=cache_position,
1862
  position_embeddings=position_embeddings,
1863
+ **kwargs
1864
  )
1865
 
1866
  hidden_states = residual + hidden_states
 
1876
  if output_attentions:
1877
  outputs += (self_attn_weights,)
1878
 
1879
+
1880
  if use_cache:
1881
  outputs += (present_key_value,)
1882
 
 
1893
  self.padding_idx = config.pad_token_id
1894
  self.vocab_size = config.vocab_size
1895
 
1896
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
 
 
1897
  self.layers = nn.ModuleList(
1898
+ [KeyeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1899
  )
1900
  self._attn_implementation = config._attn_implementation
1901
  self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
1923
  output_hidden_states: Optional[bool] = None,
1924
  return_dict: Optional[bool] = None,
1925
  cache_position: Optional[torch.LongTensor] = None,
1926
+ **kwargs
1927
  ) -> Union[Tuple, BaseModelOutputWithPast]:
1928
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1929
  output_hidden_states = (
1930
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
1931
  )
1932
  use_cache = use_cache if use_cache is not None else self.config.use_cache
1933
 
1934
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
 
1935
 
1936
  if (input_ids is None) ^ (inputs_embeds is not None):
1937
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
 
1938
 
1939
  if self.gradient_checkpointing and self.training:
1940
  if use_cache:
 
1951
  inputs_embeds = self.embed_tokens(input_ids)
1952
 
1953
  if cache_position is None:
1954
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
 
 
1955
  cache_position = torch.arange(
1956
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
 
 
1957
  )
1958
 
1959
  # the hard coded `3` is for temporal, height and width.
1960
  if position_ids is None:
1961
+ position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
 
 
1962
  elif position_ids.dim() == 2:
1963
  position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
1964
 
1965
  causal_mask = self._update_causal_mask(
1966
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
1967
  )
1968
  hidden_states = inputs_embeds
1969
 
 
2023
  next_cache = next_decoder_cache if use_cache else None
2024
 
2025
  if not return_dict:
2026
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
2027
  return BaseModelOutputWithPast(
2028
  last_hidden_state=hidden_states,
2029
  past_key_values=next_cache,
 
2041
  ):
2042
  if self.config._attn_implementation == "flash_attention_2":
2043
  if attention_mask is not None and past_key_values is not None:
2044
+ is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
 
 
2045
  if is_padding_right:
2046
  raise ValueError(
2047
  "You are attempting to perform batched generation with padding_side='right'"
 
2055
  # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
2056
  # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
2057
  # to infer the attention mask.
2058
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
 
 
2059
  using_static_cache = isinstance(past_key_values, StaticCache)
2060
  using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
2061
 
 
2110
  # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
2111
  # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
2112
  # Details: https://github.com/pytorch/pytorch/issues/110213
2113
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
 
 
2114
 
2115
  return causal_mask
2116
 
 
2156
  else:
2157
  min_dtype = torch.finfo(dtype).min
2158
  causal_mask = torch.full(
2159
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
2160
  )
2161
+ diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
 
 
2162
  if config.sliding_window is not None:
2163
  # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
2164
  # the check is needed to verify is current checkpoint was trained with sliding window or not
2165
+ if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
2166
+ sliding_attend_mask = torch.arange(target_length, device=device) <= (
2167
+ cache_position.reshape(-1, 1) - config.sliding_window
2168
+ )
2169
  diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
2170
  causal_mask *= diagonal_attend_mask
2171
  causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
2172
  if attention_mask is not None:
2173
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
 
 
2174
  if attention_mask.shape[-1] > target_length:
2175
  attention_mask = attention_mask[:, :target_length]
2176
  mask_length = attention_mask.shape[-1]
2177
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
2178
+ causal_mask.device
2179
+ )
2180
  padding_mask = padding_mask == 0
2181
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
2182
+ padding_mask, min_dtype
2183
+ )
2184
  return causal_mask
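A simplified, standalone sketch (hypothetical lengths, no batch or head dims) of the mask recipe above: start from the dtype minimum, zero out keys at or before each query's cache position, and re-mask keys that fall outside the sliding window.

import torch

dtype = torch.float32
sequence_length, target_length, sliding_window = 4, 6, 3
min_dtype = torch.finfo(dtype).min
cache_position = torch.arange(2, 2 + sequence_length)         # absolute query positions

causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
diagonal_attend_mask = torch.arange(target_length) > cache_position.reshape(-1, 1)   # future keys stay masked
sliding_attend_mask = torch.arange(target_length) <= (cache_position.reshape(-1, 1) - sliding_window)
diagonal_attend_mask.bitwise_or_(sliding_attend_mask)         # also mask keys older than the window
causal_mask *= diagonal_attend_mask                           # 0 where attention is allowed
assert (causal_mask[0] == 0).sum() == sliding_window          # each query sees at most `sliding_window` keys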
2185
 
2186
 
 
2240
  # Initialize weights and apply final processing
2241
  self.post_init()
2242
 
2243
+
2244
  def get_input_embeddings(self):
2245
  return self.model.embed_tokens
2246
 
 
2325
  video_token_id = self.config.video_token_id
2326
  vision_start_token_id = self.config.vision_start_token_id
2327
  mrope_position_deltas = []
2328
+ if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
 
 
2329
  total_input_ids = input_ids
2330
  if attention_mask is None:
2331
  attention_mask = torch.ones_like(total_input_ids)
 
2341
  for i, input_ids in enumerate(total_input_ids):
2342
  input_ids = input_ids[attention_mask[i] == 1]
2343
  image_nums, video_nums = 0, 0
2344
+ vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
 
 
2345
  vision_tokens = input_ids[vision_start_indices + 1]
2346
  image_nums = (vision_tokens == image_token_id).sum()
2347
  video_nums = (vision_tokens == video_token_id).sum()
 
2389
  )
2390
  text_len = ed - st
2391
 
2392
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
2393
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
2394
 
2395
+ if torch.is_tensor(second_per_grid_t): second_per_grid_t = second_per_grid_t.detach().item()
 
2396
  range_tensor = torch.arange(llm_grid_t).view(-1, 1)
2397
  expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
2398
 
2399
+ time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
2400
 
2401
  time_tensor_long = time_tensor.long()
2402
  t_index = time_tensor_long.flatten()
2403
 
2404
+ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
2405
+ w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
2406
+ llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
2407
  st = ed + llm_grid_t * llm_grid_h * llm_grid_w
2408
 
2409
  if st < len(input_tokens):
2410
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
2411
  text_len = len(input_tokens) - st
2412
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
 
 
2413
 
2414
  llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
2415
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
2416
+ mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
2417
+ mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
2418
  return position_ids, mrope_position_deltas
2419
  else:
2420
  if attention_mask is not None:
2421
  position_ids = attention_mask.long().cumsum(-1) - 1
2422
  position_ids.masked_fill_(attention_mask == 0, 1)
2423
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
2424
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
2425
  mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
2426
  else:
2427
  position_ids = (
 
2437
 
2438
  return position_ids, mrope_position_deltas
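A small worked example (not from the file) of the text-only fallback above: position ids are derived from the attention mask and the rope delta records how far each sample's last position falls short of the padded length.

import torch

attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                               [1, 1, 0, 0, 0, 0]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)     # (3, bs, seq): t/h/w axes share ids for text
max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
assert mrope_position_deltas.tolist() == [[-2], [-4]]          # shorter samples get larger negative deltas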
2439
 
2440
+ @replace_return_docstrings(output_type=KeyeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
 
 
2441
  def forward(
2442
  self,
2443
  input_ids: torch.LongTensor = None,
 
2457
  rope_deltas: Optional[torch.LongTensor] = None,
2458
  cache_position: Optional[torch.LongTensor] = None,
2459
  second_per_grid_ts: Optional[torch.Tensor] = None,
2460
+ **kwargs
2461
  ) -> Union[Tuple, KeyeCausalLMOutputWithPast]:
2462
  r"""
2463
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
 
2498
  "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
2499
  ```"""
2500
 
2501
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
2502
  output_hidden_states = (
2503
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
2504
  )
2505
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2506
 
2507
  if inputs_embeds is None:
2508
  inputs_embeds = self.model.embed_tokens(input_ids)
 
2521
  image_grid_hws.append(thw_tuple)
2522
  image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
2523
  siglip_position_ids.append(image_position_ids)
2524
+ sample_indices.append(torch.full((numel, ), idx, dtype=torch.int64))
2525
  cu_seqlens.append(cu_seqlens[-1] + numel)
2526
+
2527
+ siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(pixel_values.device)
2528
+ cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(pixel_values.device)
2529
+ sample_indices = torch.concat(sample_indices, dim=0).to(pixel_values.device)
2530
 
2531
  vision_outputs = self.visual(
2532
+ pixel_values=pixel_values,
2533
  image_grid_thw=image_grid_hws,
2534
  position_ids=siglip_position_ids,
2535
  vision_return_embed_list=True,
 
2538
  cu_seqlens=cu_seqlens,
2539
  return_pooler_output=False,
2540
  use_rope=True,
2541
+ window_size=-1,
2542
  )
2543
  image_embeds = vision_outputs.last_hidden_state
2544
 
2545
  image_embeds = self.mlp_AR(image_embeds, image_grid_thw)
2546
+
2547
  n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
2548
+ # image_embeds is a list of per-image feature tensors; concatenate them into a single tensor
2549
+ image_embeds = torch.cat(image_embeds, dim=0)
2550
  n_image_features = image_embeds.shape[0]
2551
  if n_image_tokens != n_image_features:
2552
  raise ValueError(
2553
  f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
2554
  )
2555
 
2556
+ mask = (input_ids == self.config.image_token_id)
2557
  mask_unsqueezed = mask.unsqueeze(-1)
2558
  mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
2559
  image_mask = mask_expanded.to(inputs_embeds.device)
2560
 
2561
+ image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
 
 
2562
 
2563
  inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
2564
 
 
2577
  video_grid_hws.append(thw_tuple)
2578
  video_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
2579
  siglip_position_ids.append(video_position_ids)
2580
+ sample_indices.append(torch.full((numel, ), idx, dtype=torch.int64))
2581
  cu_seqlens.append(cu_seqlens[-1] + numel)
2582
+ siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(pixel_values_videos.device)
2583
+ cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(pixel_values_videos.device)
2584
+ sample_indices = torch.concat(sample_indices, dim=0).to(pixel_values_videos.device)
2585
 
2586
  vision_outputs = self.visual(
2587
+ pixel_values=pixel_values_videos,
2588
  image_grid_thw=video_grid_hws,
2589
  position_ids=siglip_position_ids,
2590
  vision_return_embed_list=True,
 
2593
  cu_seqlens=cu_seqlens,
2594
  return_pooler_output=False,
2595
  use_rope=True,
2596
+ window_size=-1,
2597
  )
2598
  video_embeds = vision_outputs.last_hidden_state
2599
  video_embeds = self.mlp_AR(video_embeds, video_grid_thw)
2600
  n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
2601
+ video_embeds = torch.cat(video_embeds, dim=0)
2602
  n_video_features = video_embeds.shape[0]
2603
  if n_video_tokens != n_video_features:
2604
  raise ValueError(
 
2610
  mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
2611
  video_mask = mask_expanded.to(inputs_embeds.device)
2612
 
2613
+ video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
 
 
2614
  inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
2615
 
2616
  if attention_mask is not None:
2617
  attention_mask = attention_mask.to(inputs_embeds.device)
2618
 
2619
  # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
2620
+ if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
 
 
2621
  # calculate RoPE index once per generation in the pre-fill stage only
2622
  if (
2623
  (cache_position is not None and cache_position[0] == 0)
 
2658
  output_hidden_states=output_hidden_states,
2659
  return_dict=return_dict,
2660
  cache_position=cache_position,
2661
+ **kwargs
2662
  )
2663
 
2664
  hidden_states = outputs[0]
 
2778
  if expand_size == 1:
2779
  return input_ids, model_kwargs
2780
 
2781
+ visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"]
2782
 
2783
  def _expand_dict_for_generation_visual(dict_to_expand):
2784
  image_grid_thw = model_kwargs.get("image_grid_thw", None)
 
2788
  def _repeat_interleave_samples(x, lengths, repeat_times):
2789
  samples = torch.split(x, lengths)
2790
  repeat_args = [repeat_times] + [1] * (x.dim() - 1)
2791
+ result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
 
 
2792
  return result
2793
 
2794
  for key in dict_to_expand:
 
2824
  )
2825
  tensor = torch.tensor(dict_to_expand[key])
2826
  lengths = list(video_nums)
2827
+ tensor = _repeat_interleave_samples(tensor, lengths=lengths, repeat_times=expand_size)
 
 
2828
  dict_to_expand[key] = tensor.tolist()
2829
  return dict_to_expand
2830
 
 
2836
  and isinstance(dict_to_expand[key], torch.Tensor)
2837
  and key not in visual_keys
2838
  ):
2839
+ dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
 
 
2840
  return dict_to_expand
2841
 
2842
  # input_ids is required for expanding visual inputs
 
2851
 
2852
  if is_encoder_decoder:
2853
  if model_kwargs.get("encoder_outputs") is None:
2854
+ raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
2855
+ model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
 
 
 
 
2856
 
2857
  return input_ids, model_kwargs
2858
+
2859
+
2860
+
2861
+
2862
+
2863
+
2864
+
2865
+