from typing import Dict, List, Optional

from transformers.configuration_utils import PretrainedConfig


class ReneConfig(PretrainedConfig):
    r"""Configuration class for the Rene model.

    This is the configuration class to store the configuration of a [`ReneLMHeadModel`]. It is used to
    instantiate a Rene model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the
    Rene-v0.1-1.3b-pytorch model
    [cartesia-ai/Rene-v0.1-1.3b-pytorch](https://huggingface.co/cartesia-ai/Rene-v0.1-1.3b-pytorch).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
    Read the documentation from [`PretrainedConfig`] for more information.

    Args:
        d_model (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        n_layer (`int`, *optional*, defaults to 48):
            Number of architecture blocks.
        vocab_size (`int`, *optional*, defaults to 50280):
            Vocabulary size of the Rene model. Defines the number of different tokens that can be
            represented by the `input_ids` passed when calling [`ReneModel`].
        ssm_cfg (`dict`, *optional*):
            Configuration parameters for the SSM layers.
        attn_layer_idx (`List[int]`, *optional*):
            Indices of the architecture blocks that should have attention layers.
        attn_cfg (`dict`, *optional*):
            Configuration parameters for the attention layers.
        mlp_layer_idx (`List[int]`, *optional*):
            Indices of the architecture blocks that should have MLP layers.
        mlp_cfg (`dict`, *optional*):
            Configuration parameters for the MLP layers.
        rms_norm (`bool`, *optional*, defaults to `True`):
            Whether to use RMSNorm (instead of LayerNorm).
        residual_in_fp32 (`bool`, *optional*, defaults to `True`):
            Whether to keep residual values in fp32.
        pad_vocab_size_multiple (`int`, *optional*, defaults to 16):
            Pad the vocabulary size up to the next multiple of this value.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether the model's input and output word embeddings should be tied. Note that this is only
            relevant if the model has an output word embedding layer.
        pad_token_id (`int`, *optional*, defaults to 1):
            The id of the padding token.
        bos_token_id (`int`, *optional*):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 50279):
            The id of the "end-of-sequence" token.
""" model_type = "rene" def __init__( self, d_model: int = 2048, n_layer: int = 48, vocab_size: int = 50280, ssm_cfg: Optional[Dict] = None, attn_layer_idx: Optional[List] = None, attn_cfg: Optional[Dict] = None, mlp_layer_idx: Optional[List] = None, mlp_cfg: Optional[Dict] = None, rms_norm: bool = True, residual_in_fp32: bool = True, pad_vocab_size_multiple: int = 16, tie_word_embeddings: bool = True, pad_token_id=1, bos_token_id=None, eos_token_id=50279, **kwargs, ): if ssm_cfg is None: ssm_cfg = {} if attn_layer_idx is None: attn_layer_idx = [] if attn_cfg is None: attn_cfg = {} if mlp_layer_idx is None: mlp_layer_idx = [] if mlp_cfg is None: mlp_cfg = {} self.d_model = d_model self.n_layer = n_layer self.vocab_size = vocab_size self.ssm_cfg = ssm_cfg self.attn_layer_idx = attn_layer_idx self.attn_cfg = attn_cfg self.mlp_layer_idx = mlp_layer_idx self.mlp_cfg = mlp_cfg self.rms_norm = rms_norm self.residual_in_fp32 = residual_in_fp32 self.pad_vocab_size_multiple = pad_vocab_size_multiple self.tie_word_embeddings = tie_word_embeddings super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs, )