alibayram committed on
Commit 0ae8769 · verified · 1 Parent(s): 2228f0a

Upload folder using huggingface_hub

Files changed (2)
  1. config.json +1 -1
  2. modeling_deepseek.py +105 -214
config.json CHANGED
@@ -34,7 +34,7 @@
   "tie_word_embeddings": false,
   "torch_dtype": "float32",
   "auto_map": {
-    "AutoConfig": "configuration_deepseek.DeepSeekConfig",
+    "AutoConfig": "modeling_deepseek.DeepSeekConfig",
     "AutoModel": "modeling_deepseek.DeepSeekModel",
     "AutoModelForCausalLM": "modeling_deepseek.DeepSeekForCausalLM"
   }
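
The only change above repoints auto_map["AutoConfig"] from the separate configuration_deepseek module to modeling_deepseek, so both the config and the model classes now resolve from the single uploaded file. A minimal loading sketch, assuming a placeholder repo id (the actual repository name is not shown in this commit) and trust_remote_code=True, which is required for models with custom code on the Hub:

    from transformers import AutoConfig, AutoModelForCausalLM

    repo_id = "user/repo-id"  # placeholder; substitute the actual Hub repository

    # auto_map directs both lookups to classes defined in modeling_deepseek.py
    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
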
modeling_deepseek.py CHANGED
@@ -1,5 +1,5 @@
 """
-PyTorch DeepSeek model.
+PyTorch DeepSeek model - Standalone version for HuggingFace Hub
 """
 
 import math
@@ -8,10 +8,10 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from configuration_deepseek import DeepSeekConfig
 from torch.nn import CrossEntropyLoss
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
+from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_attn_mask_utils import (
     AttentionMaskConverter, _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask)
@@ -31,6 +31,94 @@ if is_flash_attn_2_available():
 
 logger = logging.get_logger(__name__)
 
+
+class DeepSeekConfig(PretrainedConfig):
+    """
+    Configuration class for DeepSeek model.
+    """
+    model_type = "deepseek"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50256,
+        hidden_size=1024,
+        intermediate_size=4096,
+        moe_intermediate_size=704,
+        num_hidden_layers=6,
+        num_dense_layers=1,
+        num_attention_heads=8,
+        num_routed_experts=4,
+        num_shared_experts=2,
+        num_activated_experts=2,
+        num_expert_groups=1,
+        num_limited_groups=1,
+        max_position_embeddings=256,
+        max_batch_size=2,
+        q_lora_rank=0,
+        kv_lora_rank=256,
+        qk_nope_head_dim=64,
+        qk_rope_head_dim=32,
+        v_head_dim=64,
+        original_seq_len=512,
+        rope_theta=10000.0,
+        rope_factor=40,
+        beta_fast=32,
+        beta_slow=1,
+        mscale=1.0,
+        initializer_range=0.02,
+        rms_norm_eps=1e-3,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=2,
+        eos_token_id=3,
+        tie_word_embeddings=False,
+        output_attentions=False,
+        output_hidden_states=False,
+        use_return_dict=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_dense_layers = num_dense_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_routed_experts = num_routed_experts
+        self.num_shared_experts = num_shared_experts
+        self.num_activated_experts = num_activated_experts
+        self.num_expert_groups = num_expert_groups
+        self.num_limited_groups = num_limited_groups
+        self.max_position_embeddings = max_position_embeddings
+        self.max_batch_size = max_batch_size
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.original_seq_len = original_seq_len
+        self.rope_theta = rope_theta
+        self.rope_factor = rope_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        self.mscale = mscale
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+        self.use_return_dict = use_return_dict
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
 _CONFIG_FOR_DOC = "DeepSeekConfig"
 
 
@@ -97,129 +185,6 @@ class DeepSeekRMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
 
 
-class DeepSeekMLA(nn.Module):
-    """Multi-head Latent Attention (MLA) module."""
-
-    def __init__(self, config: DeepSeekConfig, layer_idx: Optional[int] = None):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.head_dim = self.hidden_size // self.num_heads
-        self.num_key_value_heads = config.num_attention_heads
-        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
-        self.max_position_embeddings = config.max_position_embeddings
-        self.rope_theta = config.rope_theta
-        self.is_causal = True
-
-        # MLA specific parameters
-        self.q_lora_rank = config.q_lora_rank
-        self.kv_lora_rank = config.kv_lora_rank
-        self.qk_nope_head_dim = config.qk_nope_head_dim
-        self.qk_rope_head_dim = config.qk_rope_head_dim
-        self.v_head_dim = config.v_head_dim
-        self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
-
-        if self.q_lora_rank == 0:
-            self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.qk_head_dim, bias=False)
-        else:
-            self.q_a_proj = nn.Linear(self.hidden_size, self.q_lora_rank, bias=False)
-            self.q_a_layernorm = DeepSeekRMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
-            self.q_b_proj = nn.Linear(self.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False)
-
-        self.kv_a_proj_with_mqa = nn.Linear(
-            self.hidden_size,
-            self.kv_lora_rank + self.qk_rope_head_dim,
-            bias=False
-        )
-        self.kv_a_layernorm = DeepSeekRMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
-        self.kv_b_proj = nn.Linear(
-            self.kv_lora_rank,
-            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
-            bias=False
-        )
-        self.o_proj = nn.Linear(self.num_heads * self.v_head_dim, self.hidden_size, bias=False)
-
-        # Scaling
-        self.scaling = self.qk_head_dim ** -0.5
-        if config.max_position_embeddings > config.original_seq_len:
-            mscale = 0.1 * config.mscale * math.log(config.rope_factor) + 1.0
-            self.scaling = self.scaling * mscale * mscale
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-
-        bsz, q_len, _ = hidden_states.size()
-
-        # Query projection
-        if self.q_lora_rank == 0:
-            query_states = self.q_proj(hidden_states)
-        else:
-            query_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
-
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.qk_head_dim).transpose(1, 2)
-
-        # Split query into no-position-encoding and position-encoding parts
-        q_nope, q_pe = query_states.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
-
-        # Key-Value projection
-        kv_input = self.kv_a_proj_with_mqa(hidden_states)
-        compressed_kv, k_pe = kv_input.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
-
-        # Apply RoPE to position-encoding parts
-        if position_ids is not None:
-            cos, sin = self.rotary_emb(hidden_states, position_ids)
-            q_pe = apply_rotary_pos_emb(q_pe, cos, sin)
-            k_pe = apply_rotary_pos_emb(k_pe.unsqueeze(2), cos, sin).squeeze(2)
-
-        # Compute key and value from compressed representation
-        kv_b_weight = self.kv_b_proj.weight.view(
-            self.num_heads, self.qk_nope_head_dim + self.v_head_dim, self.kv_lora_rank
-        )
-
-        # Project compressed KV to get keys and values
-        compressed_kv = self.kv_a_layernorm(compressed_kv)
-        key_states = torch.einsum('bld,hnd->bhln', compressed_kv, kv_b_weight[:, :self.qk_nope_head_dim, :])
-        value_states = torch.einsum('bld,hnd->bhln', compressed_kv, kv_b_weight[:, -self.v_head_dim:, :])
-
-        # Attention computation
-        attn_weights = torch.matmul(q_nope, key_states.transpose(-2, -1)) * self.scaling
-
-        # Add positional attention
-        if k_pe is not None:
-            pos_attn = torch.matmul(q_pe, k_pe.unsqueeze(1).transpose(-2, -1)) * self.scaling
-            attn_weights = attn_weights + pos_attn
-
-        if attention_mask is not None:
-            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
-            attn_weights = attn_weights + causal_mask
-
-        # Apply softmax
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-
-        # Apply attention to values
-        attn_output = torch.matmul(attn_weights, value_states)
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, -1)
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-
 class DeepSeekMLP(nn.Module):
     """Multi-Layer Perceptron for dense layers."""
 
@@ -238,23 +203,6 @@ class DeepSeekMLP(nn.Module):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
 
 
-class DeepSeekExpert(nn.Module):
-    """Single expert in MoE layer."""
-
-    def __init__(self, config: DeepSeekConfig):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.intermediate_size = config.moe_intermediate_size
-
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-        self.act_fn = ACT2FN["silu"]
-
-    def forward(self, x):
-        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
-
 DEEPSEEK_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
@@ -301,83 +249,31 @@ class DeepSeekPreTrainedModel(PreTrainedModel):
 DEEPSEEK_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
+            Indices of input sequence tokens in the vocabulary.
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
+            Mask to avoid performing attention on padding token indices.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
+            Indices of positions of each input sequence token in the position embeddings.
         past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
-            Pre-computed hidden-states (key and value in the self-attention blocks and in the cross-attention blocks)
-            that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
-            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
-            Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance;
-            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
-            cache format.
-
-            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
-            legacy cache format will be returned.
-
-            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
-            have their past key/value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
-            of shape `(batch_size, sequence_length)`.
+            Pre-computed hidden-states for sequential decoding.
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
+            Optionally pass an embedded representation instead of input_ids.
         use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
+            If set to `True`, `past_key_values` key value states are returned.
         output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
+            Whether or not to return the attentions tensors.
         output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
+            Whether or not to return the hidden states.
         return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
-            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
-            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
-            the complete sequence length.
+            Whether or not to return a [`~utils.ModelOutput`].
 """
 
 
 class DeepSeekModel(DeepSeekPreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepSeekDecoderLayer`]
-
-    Args:
-        config: DeepSeekConfig
+    Simplified DeepSeek Model for demonstration purposes.
+    Note: This is a simplified implementation that preserves the model structure
+    but may not have all the advanced MLA and MoE features of the full implementation.
     """
 
     def __init__(self, config: DeepSeekConfig):
@@ -386,7 +282,6 @@ class DeepSeekModel(DeepSeekPreTrainedModel):
         self.vocab_size = config.vocab_size
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        # Note: We'll implement layers in a separate method due to complexity
         self.norm = DeepSeekRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
@@ -549,23 +444,19 @@ class DeepSeekForCausalLM(DeepSeekPreTrainedModel):
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs
     ):
-        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
-        # Exception 1: when passing input_embeds, input_ids may be missing entries
-        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Standard implementation for generation
         if past_key_values is not None:
-            if inputs_embeds is not None:  # Exception 1
+            if inputs_embeds is not None:
                 input_ids = input_ids[:, -cache_position.shape[0] :]
-            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+            elif input_ids.shape[1] != cache_position.shape[0]:
                 input_ids = input_ids[:, cache_position]
 
         if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
             if past_key_values:
                 position_ids = position_ids[:, -input_ids.shape[1] :]
 
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and cache_position[0] == 0:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
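
With DeepSeekConfig now defined inside modeling_deepseek.py, the file is self-contained and the classes can also be constructed directly. A minimal sketch using the defaults added in this commit; DeepSeekForCausalLM is taken from the auto_map entry and is assumed to follow the usual Hugging Face pattern of accepting the config as its only constructor argument:

    from modeling_deepseek import DeepSeekConfig, DeepSeekForCausalLM

    # defaults mirror the values introduced in this commit; override as needed
    config = DeepSeekConfig(hidden_size=1024, num_hidden_layers=6, num_attention_heads=8)
    model = DeepSeekForCausalLM(config)  # assumed signature, as with other HF causal LM classes
    print(sum(p.numel() for p in model.parameters()))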
 