fix sliding window merging
modeling_phi4flash.py (+7 -7)
@@ -573,7 +573,7 @@ class SambaYFlashAttention2(SambaYAttention):
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

-        use_sliding_windows = self.config.sliding_window is not None and self.config.layer_types[self.layer_idx]
+        use_sliding_windows = self.config.sliding_window is not None and self.config.layer_types[self.layer_idx] == "sliding_attention"

         if past_key_value is not None:
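As shown in the removed line, the old gate tested the truthiness of self.config.layer_types[self.layer_idx], which is a non-empty string for every layer, so use_sliding_windows came out true on full-attention layers as well. A minimal sketch of the difference (the toy layer_types list and sliding_window value are assumptions; only the attribute names and the comparison come from the diff):

# Minimal sketch, not the model code: the toy values below are assumptions,
# only the attribute names and the string comparison come from the diff above.
layer_types = ["full_attention", "sliding_attention", "full_attention"]
sliding_window = 2048

for layer_idx, layer_type in enumerate(layer_types):
    old = sliding_window is not None and layer_types[layer_idx]   # non-empty string -> always truthy
    new = sliding_window is not None and layer_types[layer_idx] == "sliding_attention"
    print(layer_idx, layer_type, bool(old), new)

# The old gating is truthy for every layer, so full-attention layers were also
# run with a sliding window; the new gating is True only for layers actually
# tagged "sliding_attention".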
@@ -710,8 +710,8 @@ class SambaYFlashAttention2(SambaYAttention):
                 softmax_scale=softmax_scale,
                 causal=causal,
                 window_size=(
-                    self.config.
-                    self.config.
+                    self.config.sliding_window -1,
+                    self.config.sliding_window -1,
                 ),
             )
@@ -735,8 +735,8 @@ class SambaYFlashAttention2(SambaYAttention):
                 softmax_scale=softmax_scale,
                 causal=causal,
                 window_size=(
-                    self.config.
-                    self.config.
+                    self.config.sliding_window -1,
+                    self.config.sliding_window -1,
                 ),
             )
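Both flash-attention call sites above now pass self.config.sliding_window - 1 as the left and right window bounds. A hedged sketch of what that buys, assuming flash-attn's convention that window_size=(left, right) lets query position i attend to keys in [i - left, i + right]: with left = sliding_window - 1, a causal query sees exactly sliding_window keys, itself included. The snippet below builds the equivalent mask in plain PyTorch with toy sizes; nothing in it comes from the model code except the sliding_window - 1 arithmetic.

import torch

# Toy sizes; only the "- 1" arithmetic mirrors the diff above.
sliding_window = 4
seq_len = 8
left = right = sliding_window - 1

i = torch.arange(seq_len)[:, None]   # query positions
j = torch.arange(seq_len)[None, :]   # key positions
causal = j <= i
in_window = (j >= i - left) & (j <= i + right)
mask = causal & in_window

# Each row attends to at most `sliding_window` keys: itself plus the
# sliding_window - 1 previous tokens, which is why the bound subtracts 1.
print(mask.int())
print(mask.sum(dim=-1))  # tensor([1, 2, 3, 4, 4, 4, 4, 4])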
@@ -1085,9 +1085,9 @@ class SambaYDecoderLayer(nn.Module):
             residual = residual.to(torch.float32)
             self_attn_weights = None
         else:
-            if self.config.sliding_window is not None and self.config.layer_types[self.layer_idx]
+            if self.config.sliding_window is not None and self.config.layer_types[self.layer_idx] == "sliding_attention" and attention_mask is not None: # efficient SDPA and no padding
                 if past_key_value is not None and cache_position[0] > 0: # when decoding
-                    attention_mask = attention_mask[:, -self.config.
+                    attention_mask = attention_mask[:, -self.config.sliding_window:]
             #hidden_states = self.input_layernorm2(hidden_states.to(dtype=self.input_layernorm2.weight.dtype))
             # Self Attention
             attn_outputs, self_attn_weights, yoco_key_values = self.attn(
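The decoder-layer change crops the 2-D padding mask before the attention call when decoding through a sliding-attention layer: once more than sliding_window tokens have been seen, only the last sliding_window key/value positions remain relevant, so the mask is cut to the same columns to stay aligned with the cached keys. A toy illustration (batch size, lengths, and padding pattern are assumptions; only the [:, -sliding_window:] slicing mirrors the diff):

import torch

# Toy illustration of the mask cropping added above. Shapes and values are
# assumptions; only the slicing expression comes from the diff.
sliding_window = 4
batch, seen_tokens = 2, 7                       # 7 tokens processed so far
attention_mask = torch.ones(batch, seen_tokens, dtype=torch.long)
attention_mask[1, :3] = 0                       # left padding on the second sequence

# When decoding, a sliding-window cache only holds the last `sliding_window`
# keys, so the padding mask is cropped to the same positions.
cropped = attention_mask[:, -sliding_window:]
print(cropped.shape)   # torch.Size([2, 4])
print(cropped)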