Update modeling_moonshot.py
modeling_moonshot.py (CHANGED, +36 -12)
@@ -427,21 +427,27 @@ class Attention(nn.Module):
         self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
     ):
         # Standard scaled dot-product attention
-        batch_size,
-
-
-        #
-        if num_kv_heads != num_heads:
-            # Each query head uses the corresponding key-value head (num_heads // num_kv_heads) times
-            key_states = repeat_kv(key_states, self.num_key_value_groups)
-            value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        # Prepare for attention computation (batch_size, num_heads, seq_length, head_dim)
+        batch_size, q_len, num_heads, head_dim = query_states.shape
+        bsz, kv_seq_len, num_kv_heads, _ = key_states.shape
+
+        # Transpose query states for matmul: (batch_size, num_heads, q_len, head_dim)
         query_states = query_states.transpose(1, 2)
+
+        # Transpose key/value states for repeat_kv: (batch_size, num_kv_heads, kv_seq_len, head_dim)
         key_states = key_states.transpose(1, 2)
         value_states = value_states.transpose(1, 2)
+
+        # Handle grouped-query attention by repeating k/v heads if necessary
+        # repeat_kv expects (batch, num_key_value_heads, slen, head_dim)
+        # repeat_kv outputs (batch, num_attention_heads, slen, head_dim)
+        if self.num_key_value_groups > 1:
+            key_states = repeat_kv(key_states, self.num_key_value_groups)
+            value_states = repeat_kv(value_states, self.num_key_value_groups)
 
-        # (batch_size, num_heads,
+        # key_states is now (batch_size, num_heads, kv_seq_len, head_dim)
+        # value_states is now (batch_size, num_heads, kv_seq_len, head_dim)
+
+        # Attention score calculation: (batch_size, num_heads, q_len, kv_seq_len)
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
 
         if softmax_scale is None:
@@ -449,15 +455,33 @@ class Attention(nn.Module):
         attn_weights = attn_weights * softmax_scale
 
         if attention_mask is not None:
+            # The attention mask passed from _flash_attention_forward is the padding_mask
+            # which is (batch_size, seq_len). We need the causal mask prepared in the main forward pass.
+            # This part needs adjustment depending on how the causal mask is passed.
+            # For now, assuming the correct mask is passed somehow.
+            # If attention_mask is the padding mask, it needs expanding and causal masking added.
+            # This standard attention path currently doesn't receive the full causal mask.
+
+            # Let's log a warning for now as this mask handling is likely incorrect
+            # compared to the original Llama attention or FlashAttention's causal=True
+            logger.warning_once(
+                "Standard attention mask handling might be incomplete. "
+                "Ensure the correct causal mask is being used if not using Flash Attention."
+            )
+            # Assuming attention_mask is already the correct shape [bsz, 1, q_len, kv_seq_len]
+            # If it's the padding mask [bsz, kv_seq_len], it needs expansion + causal.
             attn_weights = attn_weights + attention_mask
 
         # Apply softmax and dropout
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
         attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=self.training)
 
-        # Context vectors
+        # Context vectors: (batch_size, num_heads, q_len, head_dim)
         attn_output = torch.matmul(attn_weights, value_states)
+
+        # Reshape to original format: (batch_size, num_heads, q_len, head_dim) -> (batch_size, q_len, hidden_size)
         attn_output = attn_output.transpose(1, 2).contiguous()
+        # attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) # This reshape happens outside this function
 
         return attn_output
 
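For reference, repeat_kv is the Llama-style helper that this change now calls only when self.num_key_value_groups > 1. A minimal sketch of that helper, assuming the common expand-and-reshape implementation used in transformers-style modeling code (not copied from modeling_moonshot.py itself):

import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Expand (batch, num_key_value_heads, slen, head_dim) to
    # (batch, num_key_value_heads * n_rep, slen, head_dim).
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Add a repetition axis, broadcast it, then fold it into the head axis.
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

With, say, 32 query heads and 8 key/value heads, num_key_value_groups is 4, so a (bsz, 8, kv_seq_len, head_dim) tensor becomes (bsz, 32, kv_seq_len, head_dim), which is what the matmul against the transposed query_states expects.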
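The new comments flag that this standard path may receive only the 2D padding mask of shape (batch_size, kv_seq_len) rather than the 4D additive causal mask. As an illustration of the expansion those comments describe, and not the module's actual mask-preparation code, a hypothetical build_causal_mask helper could combine the causal constraint and the padding mask into the [bsz, 1, q_len, kv_seq_len] shape the code assumes:

import torch

def build_causal_mask(padding_mask: torch.Tensor, q_len: int, dtype: torch.dtype) -> torch.Tensor:
    # padding_mask: (batch_size, kv_seq_len) with 1 for real tokens and 0 for padding.
    bsz, kv_seq_len = padding_mask.shape
    device = padding_mask.device

    # Query i sits at absolute position kv_seq_len - q_len + i (queries align with the end
    # of the key/value sequence when a cache is present) and may attend to keys at or before it.
    q_pos = torch.arange(kv_seq_len - q_len, kv_seq_len, device=device)[:, None]
    k_pos = torch.arange(kv_seq_len, device=device)[None, :]
    allowed = (k_pos <= q_pos)[None, None, :, :] & padding_mask[:, None, None, :].bool()

    # Additive mask: 0 where attention is allowed, a large negative value elsewhere.
    mask = torch.zeros(bsz, 1, q_len, kv_seq_len, dtype=dtype, device=device)
    return mask.masked_fill(~allowed, torch.finfo(dtype).min)

Adding such a mask to attn_weights broadcasts over the head dimension, so attn_weights = attn_weights + attention_mask behaves the same whether or not the key/value heads were repeated.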