""" DeepSeek model configuration """ from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class DeepSeekConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`DeepSeekModel`]. It is used to instantiate a DeepSeek model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the DeepSeek-V3 [deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: vocab_size (`int`, *optional*, defaults to 50256): Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DeepSeekModel`] hidden_size (`int`, *optional*, defaults to 1024): Dimension of the hidden representations. intermediate_size (`int`, *optional*, defaults to 4096): Dimension of the MLP representations for dense layers. moe_intermediate_size (`int`, *optional*, defaults to 704): Dimension of the MLP representations for MoE layers. num_hidden_layers (`int`, *optional*, defaults to 6): Number of hidden layers in the Transformer decoder. num_dense_layers (`int`, *optional*, defaults to 1): Number of dense (non-MoE) layers in the model. num_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer decoder. num_routed_experts (`int`, *optional*, defaults to 4): Number of routed experts in MoE layers. num_shared_experts (`int`, *optional*, defaults to 2): Number of shared experts in MoE layers. num_activated_experts (`int`, *optional*, defaults to 2): Number of experts activated per token in MoE layers. num_expert_groups (`int`, *optional*, defaults to 1): Number of expert groups in MoE layers. num_limited_groups (`int`, *optional*, defaults to 1): Number of limited groups in MoE layers. score_func (`str`, *optional*, defaults to `"softmax"`): Scoring function for expert selection. Can be "softmax" or "sigmoid". route_scale (`float`, *optional*, defaults to 1.0): Scaling factor for routing weights. q_lora_rank (`int`, *optional*, defaults to 0): Rank of LoRA adaptation for query projection. 0 means no LoRA. kv_lora_rank (`int`, *optional*, defaults to 256): Rank of LoRA adaptation for key-value projection. qk_nope_head_dim (`int`, *optional*, defaults to 64): Dimension of query-key heads without positional encoding. qk_rope_head_dim (`int`, *optional*, defaults to 32): Dimension of query-key heads with rotary positional encoding. v_head_dim (`int`, *optional*, defaults to 64): Dimension of value heads. original_seq_len (`int`, *optional*, defaults to 512): Original sequence length used during pretraining. rope_theta (`float`, *optional*, defaults to 10000.0): Base frequency for rotary positional encoding. rope_factor (`float`, *optional*, defaults to 40): Scaling factor for RoPE frequency adjustment. beta_fast (`int`, *optional*, defaults to 32): Fast beta parameter for YaRN RoPE scaling. beta_slow (`int`, *optional*, defaults to 1): Slow beta parameter for YaRN RoPE scaling. mscale (`float`, *optional*, defaults to 1.0): Scale factor for attention logits when using extended context. 
        max_position_embeddings (`int`, *optional*, defaults to 256):
            The maximum sequence length that this model might ever be used with.
        max_batch_size (`int`, *optional*, defaults to 2):
            The maximum batch size that this model might ever be used with for caching.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-3):
            The epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 2):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 3):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the input and output word embeddings.

    ```python
    >>> from transformers import DeepSeekModel, DeepSeekConfig

    >>> # Initializing a DeepSeek configuration
    >>> configuration = DeepSeekConfig()

    >>> # Initializing a model from the configuration
    >>> model = DeepSeekModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50256,
        hidden_size=1024,
        intermediate_size=4096,
        moe_intermediate_size=704,
        num_hidden_layers=6,
        num_dense_layers=1,
        num_attention_heads=8,
        num_routed_experts=4,
        num_shared_experts=2,
        num_activated_experts=2,
        num_expert_groups=1,
        num_limited_groups=1,
        score_func="softmax",
        route_scale=1.0,
        q_lora_rank=0,
        kv_lora_rank=256,
        qk_nope_head_dim=64,
        qk_rope_head_dim=32,
        v_head_dim=64,
        original_seq_len=512,
        rope_theta=10000.0,
        rope_factor=40,
        beta_fast=32,
        beta_slow=1,
        mscale=1.0,
        max_position_embeddings=256,
        max_batch_size=2,
        initializer_range=0.02,
        rms_norm_eps=1e-3,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_dense_layers = num_dense_layers
        self.num_attention_heads = num_attention_heads
        self.num_routed_experts = num_routed_experts
        self.num_shared_experts = num_shared_experts
        self.num_activated_experts = num_activated_experts
        self.num_expert_groups = num_expert_groups
        self.num_limited_groups = num_limited_groups
        self.score_func = score_func
        self.route_scale = route_scale
        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.original_seq_len = original_seq_len
        self.rope_theta = rope_theta
        self.rope_factor = rope_factor
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.max_batch_size = max_batch_size
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.tie_word_embeddings = tie_word_embeddings

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
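

# Illustrative usage sketch (not part of the original module): shows how this
# configuration can be instantiated with a few MoE/attention overrides and
# round-tripped through the standard `PretrainedConfig` dict API
# (`to_dict` / `from_dict`). The override values below are arbitrary examples,
# not recommended settings, and the assumption that the per-head query/key
# width equals `qk_nope_head_dim + qk_rope_head_dim` follows the DeepSeek-V3
# MLA layout rather than anything defined in this file.
if __name__ == "__main__":
    config = DeepSeekConfig(
        num_routed_experts=8,
        num_activated_experts=2,
        q_lora_rank=0,  # 0 keeps the full-rank query projection
        kv_lora_rank=256,
    )

    # Assumed MLA head layout: no-position and rotary components concatenated.
    qk_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim
    print(f"qk head dim: {qk_head_dim}, value head dim: {config.v_head_dim}")

    # Round-trip through a plain dict, as supported by `PretrainedConfig`.
    restored = DeepSeekConfig.from_dict(config.to_dict())
    assert restored.num_routed_experts == config.num_routed_experts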