Text Generation · Transformers · Safetensors · English · doge · conversational · custom_code
JingzeShi committed · commit 44e481b (verified) · 1 parent: 24676a2

Upload DogeForCausalLM

config.json CHANGED
@@ -1,43 +1,44 @@
 {
   "_name_or_path": "./results/Doge-60M",
   "architectures": [
     "DogeForCausalLM"
   ],
   "attention_dropout": 0.0,
   "auto_map": {
     "AutoConfig": "configuration_doge.DogeConfig",
     "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
   },
   "bos_token_id": 0,
+  "dynamic_mask_ratio": 0.0,
   "eos_token_id": 1,
   "expert_retrieval_size": 256,
   "hidden_act": "silu",
   "hidden_bias": false,
   "hidden_dropout": 0.0,
   "hidden_size": 512,
   "initializer_range": 0.02,
   "intermediate_size": 1024,
   "is_moe": false,
   "max_position_embeddings": 2048,
   "model_type": "doge",
   "num_attention_heads": 4,
   "num_cdmmoe_experts": 2048,
   "num_cdmmoe_experts_per_head": 8,
   "num_cdmmoe_heads": 4,
   "num_channels": 3,
   "num_hidden_layers": 16,
   "num_key_value_heads": 2,
   "pad_token_id": 2,
   "patch_size": 16,
   "rms_norm_eps": 1e-06,
   "rope_scaling": {
     "factor": 4.0,
     "original_max_position_embeddings": 2048,
     "rope_type": "dynamic"
   },
   "rope_theta": 10000.0,
   "torch_dtype": "float32",
-  "transformers_version": "4.46.1",
+  "transformers_version": "4.49.0.dev0",
   "use_cache": true,
   "vocab_size": 32768
 }
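Because `auto_map` in `config.json` points to the repository's custom `configuration_doge.py` and `modeling_doge.py`, the checkpoint has to be loaded with `trust_remote_code=True`. A minimal loading sketch, assuming the tokenizer files are also present in the repository and using a placeholder repo id:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "<namespace>/Doge-60M"  # placeholder: substitute the actual Hub path of this checkpoint

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,  # lets auto_map resolve configuration_doge.DogeConfig / modeling_doge.DogeForCausalLM
)

inputs = tokenizer("Hello, Doge!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```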
configuration_doge.py CHANGED
@@ -111,6 +111,8 @@ class DogeConfig(PretrainedConfig):
             If it is not specified, will default to `num_attention_heads`.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
+        dynamic_mask_ratio (`float`, *optional*, defaults to 0.0, range [0, 1]):
+            The ratio to control the proportion of the dynamic mask filled with the minimum value.
         is_moe (`bool`, *optional*, defaults to `False`):
             Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
         num_cdmmoe_experts (`int`, *optional*, defaults to 2048):
@@ -154,6 +156,7 @@
         num_attention_heads=8,
         num_key_value_heads=None,
         attention_dropout=0.0,
+        dynamic_mask_ratio=0.0,
         is_moe=False,
         num_cdmmoe_experts=2048,
         num_cdmmoe_heads=4,
@@ -183,6 +186,7 @@
         self.num_attention_heads = num_attention_heads
         self.num_key_value_heads = num_key_value_heads
         self.attention_dropout = attention_dropout
+        self.dynamic_mask_ratio = dynamic_mask_ratio
         self.is_moe = is_moe
         self.num_cdmmoe_experts = num_cdmmoe_experts
         self.num_cdmmoe_heads = num_cdmmoe_heads
@@ -195,6 +199,10 @@
             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
         rope_config_validation(self)
 
+        # for backward compatibility
+        if num_key_value_heads is None:
+            self.num_key_value_heads = num_attention_heads
+
         super().__init__(
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
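A short sketch of how the new argument surfaces through `DogeConfig`: `dynamic_mask_ratio` is stored on the config, and the backward-compatibility branch added above lets `num_key_value_heads` fall back to `num_attention_heads` when it is left unset. Illustrative only, run from a checkout of this repository so `configuration_doge` is importable:

```python
from configuration_doge import DogeConfig

# dynamic_mask_ratio defaults to 0.0 (no extra masking); values in (0, 1)
# fill that fraction of the dynamic mask with the dtype minimum.
config = DogeConfig(num_attention_heads=4, num_key_value_heads=2, dynamic_mask_ratio=0.25)
print(config.dynamic_mask_ratio)   # 0.25

# leaving num_key_value_heads unset now falls back to num_attention_heads
config = DogeConfig(num_attention_heads=8)
print(config.num_key_value_heads)  # 8
```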
generation_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "_from_model_config": true,
   "bos_token_id": 0,
   "eos_token_id": 1,
   "pad_token_id": 2,
-  "transformers_version": "4.46.1"
+  "transformers_version": "4.49.0.dev0"
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f6ff7db0f6721882934053a9c20eec73c33b55fc47ef428e20a0e91391738985
-size 218391112
+oid sha256:550dbbf30bc9f8b88c7ac4136a1412414be8db29a5146b9f0bab2e795ab991e5
+size 218325576
modeling_doge.py CHANGED
@@ -22,6 +22,7 @@ import math
 from typing import List, Optional, Tuple, Union
 
 import torch
+from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
@@ -216,14 +217,15 @@ class DogeDynamicMaskAttention(nn.Module):
         self.num_key_value_heads = config.num_key_value_heads
         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
         self.attention_dropout = config.attention_dropout
+        self.dynamic_mask_ratio = config.dynamic_mask_ratio
 
         # Q K V O projections
         self.q_proj = nn.Linear(self.hidden_dim, self.num_heads * self.head_dim, bias=config.hidden_bias)
         self.k_proj = nn.Linear(self.hidden_dim, self.num_key_value_heads * self.head_dim, bias=config.hidden_bias)
+        self.v_proj = nn.Linear(self.hidden_dim, self.num_key_value_heads * self.head_dim, bias=config.hidden_bias)
         # dynamic mask for the QK^T attention score matrix
         self.A = nn.Parameter(torch.ones(self.num_heads))
-        self.dt_proj = nn.Linear(self.hidden_dim, self.num_heads, bias=config.hidden_bias)
-        self.v_proj = nn.Linear(self.hidden_dim, self.num_key_value_heads * self.head_dim, bias=config.hidden_bias)
+        self.dt_proj = nn.Linear(self.num_key_value_heads * self.head_dim, self.num_heads, bias=config.hidden_bias)
         self.o_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=config.hidden_bias)
 
     def forward(
@@ -254,6 +256,10 @@
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
+        # calculate dynamic mask from value_states
+        dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
+        dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
+
         # repeat key and value states
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
@@ -262,12 +268,13 @@
         attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2)) / math.sqrt(self.head_dim)
 
         # add mask to attention scores
-        if attention_mask is not None:
-            dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
-            dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
-            dynamic_mask = dynamic_mask < 1.0
-            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]].masked_fill(dynamic_mask[:, :, None, :], torch.finfo(hidden_states.dtype).min)
-            attn_weights = attn_weights + causal_mask
+        attn_mask = self.prepare_dynamic_mask(
+            hidden_states=hidden_states,
+            dynamic_mask=dynamic_mask,
+            dynamic_mask_ratio=self.dynamic_mask_ratio,
+            attention_mask=attention_mask,
+        )
+        attn_weights = attn_weights + attn_mask
 
         # upcast attention scores to fp32
         attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
@@ -282,8 +289,37 @@
 
         return attn_output, past_key_value
 
+    def prepare_dynamic_mask(
+        self,
+        hidden_states: torch.Tensor,
+        dynamic_mask: torch.Tensor,
+        dynamic_mask_ratio: float = 0.0,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        Combine `dynamic_mask` with `attention_mask` to generate the final `attn_mask`.
+
+        Args:
+            hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
+            dynamic_mask (`torch.Tensor`): dynamic mask of shape `(batch_size, num_heads, key_sequence_length)`.
+            dynamic_mask_ratio (`float`, *optional*): Ratio from 0.0 to 1.0 used to control the proportion of the dynamic mask filled with the minimum value.
+            attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
+        """
+        min_type = torch.finfo(hidden_states.dtype).min
+        attn_mask = dynamic_mask[:, :, None, :]
+        if 0.0 < dynamic_mask_ratio < 1.0:
+            rate_value = torch.kthvalue(
+                attn_mask,
+                int(attn_mask.shape[-1] * dynamic_mask_ratio),
+                dim=-1, keepdim=True,
+            ).values
+            attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
+        if attention_mask is not None:
+            attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : hidden_states.shape[-2]] == min_type, min_type)
+        return attn_mask
+
 
-class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
+class DogeSdpaDynamicMaskAttention(DogeDynamicMaskAttention):
 
     def forward(
         self,
@@ -312,34 +348,31 @@ class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
             # sin and cos are specific to RoPE models; cache_position needed for the static cache
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # calculate dynamic mask from value_states
+        dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
+        dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
 
-        # repeat key and value states
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        causal_mask = attention_mask
-        if attention_mask is not None:
-            dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
-            dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
-            dynamic_mask = dynamic_mask < 1.0
-            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]].masked_fill(dynamic_mask[:, :, None, :], torch.finfo(hidden_states.dtype).min)
+        attn_mask = self.prepare_dynamic_mask(
+            hidden_states=hidden_states,
+            dynamic_mask=dynamic_mask,
+            dynamic_mask_ratio=self.dynamic_mask_ratio,
+            attention_mask=attention_mask,
+        )
 
         query_states = query_states.contiguous()
         key_states = key_states.contiguous()
         value_states = value_states.contiguous()
 
-        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
-        is_causal = True if causal_mask is None and q_len > 1 else False
-
         # NOTE: As of pytorch 2.5.1, cuDNN's SDPA backward pass is still incorrect, so we disable cuDNN SDPA (see https://github.com/pytorch/pytorch/issues/138581)
         torch.backends.cuda.enable_cudnn_sdp(False)
         attn_output = F.scaled_dot_product_attention(
             query_states,
             key_states,
             value_states,
-            attn_mask=causal_mask,
+            attn_mask=attn_mask,
             dropout_p=self.attention_dropout if self.training else 0.0,
-            is_causal=is_causal,
+            enable_gqa=True,
         )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
@@ -349,9 +382,70 @@
         return attn_output, past_key_value
 
 
+class DogeFlexDynamicMaskAttention(DogeDynamicMaskAttention):
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[Cache]]:
+        bsz, q_len, _ = hidden_states.shape
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
+        dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
+
+        attn_mask = self.prepare_dynamic_mask(
+            hidden_states=hidden_states,
+            dynamic_mask=dynamic_mask,
+            dynamic_mask_ratio=self.dynamic_mask_ratio,
+            attention_mask=attention_mask,
+        )
+        # TODO: flex_attention: Captured buffers that require grad are not yet supported.
+        # NOTE: So we only use flex_attention in inference mode.
+        def dynamic_mask_mod(score, batch, head, q_idx, kv_idx):
+            score = score + attn_mask[batch][head][q_idx][kv_idx]
+            return score
+
+        attn_output = flex_attention(
+            query_states,
+            key_states,
+            value_states,
+            score_mod=dynamic_mask_mod,
+            enable_gqa=True,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, past_key_value
+
+
 DOGE_ATTENTION_CLASSES = {
+    "flex_attention": DogeFlexDynamicMaskAttention,
     "eager": DogeDynamicMaskAttention,
-    "sdpa": DogeSdpaDynamicMaskAttn,
+    "sdpa": DogeSdpaDynamicMaskAttention,
 }
 
 
@@ -519,6 +613,7 @@ class DogePreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["DogeDecoderLayer"]
     _skip_keys_device_placement = ["past_key_values"]
+    _supports_flex_attn = True
     _supports_sdpa = True
     _supports_cache_class = True
     _supports_quantized_cache = True
@@ -693,7 +788,7 @@ class DogeModel(DogePreTrainedModel):
         all_self_attns = () if output_attentions else None
         next_decoder_cache = None
 
-        for decoder_layer in self.layers:
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
 
@@ -877,7 +972,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
@@ -886,7 +981,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         num_logits_to_keep: int = 0,
-        **loss_kwargs,
+        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
         r"""
         Args:
@@ -920,6 +1015,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
             cache_position=cache_position,
+            **kwargs,
         )
 
         hidden_states = outputs[0]
@@ -929,7 +1025,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
 
         loss = None
         if labels is not None:
-            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **loss_kwargs)
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **kwargs)
 
         if not return_dict:
             output = (logits,) + outputs[1:]
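With the updated `DOGE_ATTENTION_CLASSES`, the eager, SDPA, and flex-attention paths are selected through the usual `attn_implementation` switch when loading the model. A hedged sketch under those assumptions (placeholder repo id; per the TODO/NOTE in the flex path above, `flex_attention` is intended for inference only and needs a recent PyTorch):

```python
import torch
from transformers import AutoModelForCausalLM

repo_id = "<namespace>/Doge-60M"  # placeholder

# "sdpa" maps to DogeSdpaDynamicMaskAttention; "eager" and "flex_attention"
# map to the other classes registered in DOGE_ATTENTION_CLASSES.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    attn_implementation="sdpa",
    torch_dtype=torch.float32,
)
model.eval()  # inference mode; required for the flex_attention path
```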