lukasmoeller committed on
Commit 7154cc3
1 Parent(s): e75df47

Upload MPTForCausalLM

config.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "_name_or_path": "mosaicml/mpt-7b",
+   "architectures": [
+     "MPTForCausalLM"
+   ],
+   "attn_config": {
+     "alibi": true,
+     "alibi_bias_max": 8,
+     "attn_impl": "torch",
+     "attn_pdrop": 0,
+     "attn_type": "multihead_attention",
+     "attn_uses_sequence_id": false,
+     "clip_qkv": null,
+     "prefix_lm": false,
+     "qk_ln": false,
+     "softmax_scale": null
+   },
+   "auto_map": {
+     "AutoConfig": "configuration_mpt.MPTConfig",
+     "AutoModelForCausalLM": "modeling_mpt.MPTForCausalLM"
+   },
+   "d_model": 4096,
+   "emb_pdrop": 0,
+   "embedding_fraction": 1.0,
+   "expansion_ratio": 4,
+   "init_config": {
+     "emb_init_std": null,
+     "emb_init_uniform_lim": null,
+     "fan_mode": "fan_in",
+     "init_div_is_residual": true,
+     "init_gain": 0,
+     "init_nonlinearity": "relu",
+     "init_std": 0.02,
+     "name": "kaiming_normal_",
+     "verbose": 0
+   },
+   "init_device": "cpu",
+   "learned_pos_emb": true,
+   "logit_scale": null,
+   "max_seq_len": 2048,
+   "model_type": "mpt",
+   "n_heads": 32,
+   "n_layers": 32,
+   "no_bias": true,
+   "norm_type": "low_precision_layernorm",
+   "resid_pdrop": 0,
+   "tokenizer_name": "EleutherAI/gpt-neox-20b",
+   "torch_dtype": "float32",
+   "transformers_version": "4.28.1",
+   "use_cache": false,
+   "verbose": 0,
+   "vocab_size": 50432
+ }
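
Because "auto_map" above points AutoConfig and AutoModelForCausalLM at the configuration_mpt.py and modeling_mpt.py files bundled with this upload, loading the checkpoint through transformers requires trust_remote_code=True. A minimal loading sketch follows; the repository id is a placeholder, not taken from this commit.

# Sketch only: substitute the actual repository path that hosts these files.
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "<namespace>/<repo>"  # placeholder repo id
# trust_remote_code=True lets transformers import the bundled
# configuration_mpt.py / modeling_mpt.py modules named in auto_map.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
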
configuration_mpt.py ADDED
@@ -0,0 +1,118 @@
+ """A HuggingFace-style model configuration."""
+ from typing import Dict, Optional, Union
+ from transformers import PretrainedConfig
+ attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
+ init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu'}
+
+ class MPTConfig(PretrainedConfig):
+     model_type = 'mpt'
+
+     def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
+         """The MPT configuration class.
+
+         Args:
+             d_model (int): The size of the embedding dimension of the model.
+             n_heads (int): The number of attention heads.
+             n_layers (int): The number of layers in the model.
+             expansion_ratio (int): The ratio of the up/down scale in the MLP.
+             max_seq_len (int): The maximum sequence length of the model.
+             vocab_size (int): The size of the vocabulary.
+             resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
+             emb_pdrop (float): The dropout probability for the embedding layer.
+             learned_pos_emb (bool): Whether to use learned positional embeddings
+             attn_config (Dict): A dictionary used to configure the model's attention module:
+                 attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
+                 attn_pdrop (float): The dropout probability for the attention layers.
+                 attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
+                 qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
+                 clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
+                     this value.
+                 softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
+                     use the default scale of ``1/sqrt(d_keys)``.
+                 prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
+                     extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
+                     can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
+                 attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
+                     When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
+                     which sub-sequence each token belongs to.
+                     Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
+                 alibi (bool): Whether to use the alibi bias instead of position embeddings.
+                 alibi_bias_max (int): The maximum value of the alibi bias.
+             init_device (str): The device to use for parameter initialization.
+             logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
+             no_bias (bool): Whether to use bias in all layers.
+             verbose (int): The verbosity level. 0 is silent.
+             embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
+             norm_type (str): choose type of norm to use
+             multiquery_attention (bool): Whether to use multiquery attention implementation.
+             use_cache (bool): Whether or not the model should return the last key/values attentions
+             init_config (Dict): A dictionary used to configure the model initialization:
+                 init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
+                     'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
+                     'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
+                 init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
+                 emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
+                 emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
+                     used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
+                 init_std (float): The standard deviation of the normal distribution used to initialize the model,
+                     if using the baseline_ parameter initialization scheme.
+                 init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
+                 fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
+                 init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
+                 ---
+                 See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
+         """
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.expansion_ratio = expansion_ratio
+         self.max_seq_len = max_seq_len
+         self.vocab_size = vocab_size
+         self.resid_pdrop = resid_pdrop
+         self.emb_pdrop = emb_pdrop
+         self.learned_pos_emb = learned_pos_emb
+         self.attn_config = attn_config
+         self.init_device = init_device
+         self.logit_scale = logit_scale
+         self.no_bias = no_bias
+         self.verbose = verbose
+         self.embedding_fraction = embedding_fraction
+         self.norm_type = norm_type
+         self.use_cache = use_cache
+         self.init_config = init_config
+         if 'name' in kwargs:
+             del kwargs['name']
+         if 'loss_fn' in kwargs:
+             del kwargs['loss_fn']
+         super().__init__(**kwargs)
+         self._validate_config()
+
+     def _set_config_defaults(self, config, config_defaults):
+         for (k, v) in config_defaults.items():
+             if k not in config:
+                 config[k] = v
+         return config
+
+     def _validate_config(self):
+         self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
+         self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
+         if self.d_model % self.n_heads != 0:
+             raise ValueError('d_model must be divisible by n_heads')
+         if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
+             raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
+         if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
+             raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
+         if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
+             raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
+         if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
+             raise NotImplementedError('alibi only implemented with torch and triton attention.')
+         if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
+             raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
+         if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
+             raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
+         if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
+             raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
+         if self.init_config.get('name', None) is None:
+             raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
+         if not self.learned_pos_emb and (not self.attn_config['alibi']):
+             raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.')
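
As a reference for how the class above behaves, a small usage sketch (assuming configuration_mpt.py is importable from the working directory): the constructor stores its arguments, then _validate_config() fills any missing attn_config / init_config keys from the module-level defaults and checks invariants such as d_model being divisible by n_heads.

# Sketch: instantiate MPTConfig with the same values config.json uses above.
from configuration_mpt import MPTConfig

cfg = MPTConfig(
    d_model=4096,
    n_heads=32,
    n_layers=32,
    max_seq_len=2048,
    vocab_size=50432,
    no_bias=True,
    attn_config={'attn_impl': 'torch', 'alibi': True},  # other keys filled from attn_config_defaults
)
assert cfg.attn_config['attn_type'] == 'multihead_attention'  # default filled in by _validate_config()
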
generation_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.28.1",
+   "use_cache": false
+ }
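
This file is picked up by transformers' GenerationConfig machinery when the model is loaded. A quick check, again with a placeholder repository id:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("<namespace>/<repo>")  # placeholder repo id
print(gen_cfg.use_cache)  # False, matching the file above
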
pytorch_model-00001-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7cbe07a0e01da78114b07a17a0b51bb4b5e1a717aa712262d853c2b15273c35
+ size 9953500837
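
The three pytorch_model-*.bin entries are Git LFS pointer files: each records only the sha256 oid and byte size of the real shard, which Git LFS (or the Hub's download tooling) fetches separately. A sketch for verifying a locally downloaded shard against the pointer above (the local file path is an assumption):

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file in 1 MiB chunks and return the hex sha256 digest.
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

expected_oid = 'b7cbe07a0e01da78114b07a17a0b51bb4b5e1a717aa712262d853c2b15273c35'
assert sha256_of('pytorch_model-00001-of-00003.bin') == expected_oid
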
pytorch_model-00002-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f436bd1646b321bd766664993ad1e9bb75342f6cb964d80fc62cdb1c33a6940
+ size 9932530109
pytorch_model-00003-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c144331f808af6131ed7a3ea9945b879a301efc7a015ef5fc686a2255851f555
+ size 6711181899
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,201 @@
+ {
+   "metadata": {
+     "total_size": 26597146624
+   },
+   "weight_map": {
+     "transformer.blocks.0.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.0.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.0.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.0.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.0.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.0.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.1.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.1.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.1.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.1.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.1.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.1.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.10.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.10.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.10.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.10.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.10.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.10.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.11.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.11.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.11.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.11.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.11.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.11.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.12.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.12.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.12.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.12.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.12.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.12.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.13.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.13.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.13.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.13.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.13.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.13.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.14.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.14.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.14.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.14.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.14.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.14.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.15.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.15.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.15.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.15.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.15.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.15.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.16.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.16.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.16.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.16.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.16.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.16.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.17.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.17.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.17.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.17.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.17.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.17.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.18.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.18.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.18.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.18.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.18.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.18.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.19.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.19.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.19.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.19.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.19.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.19.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.2.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.2.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.2.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.2.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.2.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.2.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.20.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.20.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.20.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.20.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.20.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.20.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.21.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.21.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.21.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.21.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.21.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.21.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.22.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.22.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.22.ffn.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.22.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.22.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.22.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.23.attn.Wqkv.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.23.attn.out_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.23.ffn.down_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.23.ffn.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.23.norm_1.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.23.norm_2.weight": "pytorch_model-00002-of-00003.bin",
+     "transformer.blocks.24.attn.Wqkv.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.24.attn.out_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.24.ffn.down_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.24.ffn.up_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.24.norm_1.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.24.norm_2.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.25.attn.Wqkv.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.25.attn.out_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.25.ffn.down_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.25.ffn.up_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.25.norm_1.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.25.norm_2.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.26.attn.Wqkv.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.26.attn.out_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.26.ffn.down_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.26.ffn.up_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.26.norm_1.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.26.norm_2.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.27.attn.Wqkv.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.27.attn.out_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.27.ffn.down_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.27.ffn.up_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.27.norm_1.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.27.norm_2.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.28.attn.Wqkv.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.28.attn.out_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.28.ffn.down_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.28.ffn.up_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.28.norm_1.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.28.norm_2.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.29.attn.Wqkv.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.29.attn.out_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.29.ffn.down_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.29.ffn.up_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.29.norm_1.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.29.norm_2.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.3.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.3.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.3.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.3.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.3.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.3.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.30.attn.Wqkv.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.30.attn.out_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.30.ffn.down_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.30.ffn.up_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.30.norm_1.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.30.norm_2.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.31.attn.Wqkv.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.31.attn.out_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.31.ffn.down_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.31.ffn.up_proj.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.31.norm_1.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.31.norm_2.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.blocks.4.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.4.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.4.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.4.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.4.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.4.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.5.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.5.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.5.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.5.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.5.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.5.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.6.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.6.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.6.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.6.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.6.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.6.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.7.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.7.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.7.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.7.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.7.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.7.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.8.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.8.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.8.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.8.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.8.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.8.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.9.attn.Wqkv.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.9.attn.out_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.9.ffn.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.9.ffn.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.9.norm_1.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.blocks.9.norm_2.weight": "pytorch_model-00001-of-00003.bin",
+     "transformer.norm_f.weight": "pytorch_model-00003-of-00003.bin",
+     "transformer.wte.weight": "pytorch_model-00001-of-00003.bin"
+   }
+ }
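
The index tells from_pretrained which of the three shards holds each tensor ("total_size" is the summed byte size of all weights, roughly 26.6 GB in float32). Shard resolution happens automatically, but the mapping can also be inspected by hand; a small sketch assuming the index file is present locally:

import json

with open('pytorch_model.bin.index.json') as f:
    index = json.load(f)

# Which shard stores block 12's fused QKV projection?
print(index['weight_map']['transformer.blocks.12.attn.Wqkv.weight'])
# -> pytorch_model-00002-of-00003.bin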