atsuki-yamaguchi committed
Commit 639b1f9 · verified · 1 Parent(s): 154cb9e

Upload folder using huggingface_hub

README.md CHANGED
@@ -1,33 +1,21 @@
  ---
- license: mit
- language:
- - ar
+ library_name: peft
  ---
- Mistral-7B LAPT + CLP+ Arabic
- ===
+ ## Training procedure

- ## How to use
- ```python
- from peft import AutoPeftModelForCausalLM
- from transformers import AutoTokenizer

- model = AutoPeftModelForCausalLM.from_pretrained(
- "atsuki-yamaguchi/Mistral-7B-v0.1-clpp-ar"
- )
- ```
+ The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
+ - load_in_8bit: True
+ - load_in_4bit: False
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: fp4
+ - bnb_4bit_use_double_quant: False
+ - bnb_4bit_compute_dtype: float32
+ ### Framework versions

- ## Citation
- ```
- @article{yamaguchi2024empirical,
- title={An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient Generative {LLM} Inference},
- author={Atsuki Yamaguchi and Aline Villavicencio and Nikolaos Aletras},
- journal={ArXiv},
- year={2024},
- volume={abs/2402.10712},
- url={https://arxiv.org/abs/2402.10712}
- }
- ```
-
- ## Link
- For more details, please visit https://github.com/gucci-j/llm-cva

+ - PEFT 0.5.0
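The new README lists the `bitsandbytes` settings used during training and marks the repository as a PEFT artifact (PEFT 0.5.0). The sketch below is an illustration rather than part of this commit: it shows how that list maps onto a `transformers.BitsAndBytesConfig` and how the adapter in this repo could be loaded on top of the uploaded base weights. Applying 8-bit quantization at inference time is an assumption, since the README only states that the config was used during training.

```python
# Minimal sketch (not from the commit): rebuild the quantization config the
# README lists and load the PEFT adapter on top of the base weights that
# this repository ships. 8-bit loading at inference time is an assumption.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

repo = "atsuki-yamaguchi/Mistral-7B-v0.1-clpp-ar"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,                       # load_in_8bit: True
    load_in_4bit=False,                      # load_in_4bit: False
    llm_int8_threshold=6.0,                  # llm_int8_threshold: 6.0
    llm_int8_skip_modules=None,              # llm_int8_skip_modules: None
    llm_int8_enable_fp32_cpu_offload=False,  # llm_int8_enable_fp32_cpu_offload: False
    llm_int8_has_fp16_weight=False,          # llm_int8_has_fp16_weight: False
    bnb_4bit_quant_type="fp4",               # bnb_4bit_quant_type: fp4
    bnb_4bit_use_double_quant=False,         # bnb_4bit_use_double_quant: False
    bnb_4bit_compute_dtype=torch.float32,    # bnb_4bit_compute_dtype: float32
)

base = AutoModelForCausalLM.from_pretrained(repo, quantization_config=bnb_config)
model = PeftModel.from_pretrained(base, repo)   # adapter_config.json points back at this repo
tokenizer = AutoTokenizer.from_pretrained(repo)
```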
adapter_config.json CHANGED
@@ -1,29 +1 @@
- {
- "auto_mapping": null,
- "base_model_name_or_path": "atsuki-yamaguchi/Mistral-7B-v0.1-clpp-ar",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 32,
- "lora_dropout": 0.05,
- "modules_to_save": [
- "lm_head",
- "embed_tokens"
- ],
- "peft_type": "LORA",
- "r": 8,
- "revision": null,
- "target_modules": [
- "q_proj",
- "v_proj",
- "k_proj",
- "o_proj",
- "gate_proj",
- "down_proj",
- "up_proj"
- ],
- "task_type": "CAUSAL_LM"
- }
+ {"auto_mapping": null, "base_model_name_or_path": "atsuki-yamaguchi/Mistral-7B-v0.1-clpp-ar", "bias": "none", "fan_in_fan_out": false, "inference_mode": true, "init_lora_weights": true, "layers_pattern": null, "layers_to_transform": null, "lora_alpha": 32, "lora_dropout": 0.05, "modules_to_save": ["lm_head", "embed_tokens"], "peft_type": "LORA", "r": 8, "revision": null, "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"], "task_type": "CAUSAL_LM"}
config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Mistral-7B-v0.1-ar-clp-plus",
+ "_name_or_path": "mistralai/Mistral-7B-v0.1",
  "architectures": [
  "MistralForCausalLM"
  ],
@@ -18,7 +18,7 @@
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
- "torch_dtype": "float32",
+ "torch_dtype": "float64",
  "transformers_version": "4.35.0.dev0",
  "use_cache": true,
  "vocab_size": 64000
model-00001-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:18776d2880936a5fec755d529088f4ac00a15970e949e6c84606b6256b3a207b
- size 4941026032
+ oid sha256:84f66a3b315fab959b9e7e6a6a1b60d36e791307fc3da04008fd98b0b82190c7
+ size 4882271864
model-00002-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9d32f4e850f6bda3b1aa1fd85a372377274b9a8a166dab9047fd4cd958199746
- size 4999813072
+ oid sha256:8f0bac6759c14ee4d41847d0bae723b5a6a49581431401e0e1bde7e727c0d1aa
+ size 4832007448
model-00003-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8f3b1794261ba1e27810d553cd2b5f949fcc3a6fe37ba9b09f5033bc7e5de472
- size 4832007496
+ oid sha256:26363511ef9b74574e4b1d0240bd8cfdd94e4f7b0c89ef4ff70da486d804051d
+ size 4999813112
model-00004-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f72da02290a5f95304a8cff4c7216f30405a3f0072ac1b2b1a906e9748ff10a7
- size 4999813120
+ oid sha256:5e9ffd6a5d87b5ef96d752d8d1558aa36ac3fc5d05e9c4f6162061cec092b116
+ size 4999813128
model-00005-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f289adb9a536fe1aed9ba102e26c6b07185a467cb77def237d8715d378831662
- size 4999813128
+ oid sha256:76a7302529c832d7cfc43e98e64ccf942c9ddd3e574f283ae2e596df8f9ae364
+ size 4832007496
model-00006-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b7c45253c17a967ef82cbb9b04de5020794f951c0b749205f2f533d6e746962d
- size 4194489072
+ oid sha256:40c1a941d32a3c23f0a37ecfa4b38ea8eb7913a5373bb9244277309935e84d69
+ size 4999813120
model-00007-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:bb2eaae68b06a4407d7a8c4a51ada496a85cabb19cc7788afe5d7261f4c2b0ee
- size 1048576128
+ oid sha256:3f1e2c53dd876cc21962855732b7519ceb5603066c1555fd76dd66af90043579
+ size 1518387880
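Each of the shard entries above is a Git LFS pointer: a version line, the sha256 of the payload, and its size in bytes. A downloaded shard can be checked against its pointer with nothing but the standard library; the local path below is hypothetical, and the expected values are the ones shown for the first shard after this commit.

```python
# Sketch: verify a downloaded shard against its Git LFS pointer (sha256 + size).
import hashlib
from pathlib import Path

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

shard = Path("model-00001-of-00007.safetensors")  # hypothetical local copy
expected_oid = "84f66a3b315fab959b9e7e6a6a1b60d36e791307fc3da04008fd98b0b82190c7"
expected_size = 4882271864

assert shard.stat().st_size == expected_size
assert sha256_of(shard) == expected_oid
```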
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
  {
  "metadata": {
- "total_size": 30015504384
+ "total_size": 31064080384
  },
  "weight_map": {
  "lm_head.weight": "model-00007-of-00007.safetensors",
@@ -28,10 +28,10 @@
  "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
  "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
@@ -59,24 +59,24 @@
  "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00003-of-00007.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
  "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
  "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
  "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
  "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
  "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
  "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
  "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
@@ -122,24 +122,24 @@
  "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
  "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
  "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00004-of-00007.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
  "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
  "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
  "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
  "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
  "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
  "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
  "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
  "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
  "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
  "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
  "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
@@ -167,33 +167,33 @@
  "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
  "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
  "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00005-of-00007.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
  "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
  "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
  "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
  "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
  "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00005-of-00007.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
  "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
  "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
@@ -212,11 +212,11 @@
  "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00001-of-00007.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
  "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
  "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
  "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
@@ -230,24 +230,24 @@
  "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
- "model.layers.31.input_layernorm.weight": "model-00006-of-00007.safetensors",
- "model.layers.31.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+ "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors",
+ "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
  "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
- "model.layers.31.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
- "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+ "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
+ "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
  "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
  "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
  "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
  "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
  "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
  "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
  "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
  "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
@@ -275,24 +275,24 @@
  "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
  "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
  "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00002-of-00007.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
  "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
  "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
  "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
  "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
  "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
  "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00002-of-00007.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
- "model.norm.weight": "model-00006-of-00007.safetensors"
+ "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+ "model.norm.weight": "model-00007-of-00007.safetensors"
  }
  }
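After this commit the index reports a total_size of 31,064,080,384 bytes and rebalances the weight map: several layers move to later shards and the final norm ends up in model-00007-of-00007.safetensors, which is why the last shard grows. The index is plain JSON, so which shard holds a given tensor can be checked directly; the sketch below assumes the file has been downloaded to the working directory.

```python
# Sketch: inspect model.safetensors.index.json to see where each tensor lives.
import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])           # 31064080384 after this commit
print(index["weight_map"]["model.norm.weight"])  # model-00007-of-00007.safetensors

# Tensors per shard, to see how the re-sharding balanced the files.
for shard, count in sorted(Counter(index["weight_map"].values()).items()):
    print(shard, count)
```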
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:588176f88946a2ff8c488d92af398828e3f8541fb47d24eef98c671e22da7a1a
+ size 1093167324
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f21655ba8bdde9d44ecb6be12e1fe5543c4cdb67e0e5a746b9dcacce9091f703
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c502c4839bd0e8f15b5c7edfd874c07c774ff47fb7f76292ac238af80666792
+ size 1064
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51a88219c1d227ec061b20873c4e395ad7b93bc7b6eda6a800ea6ff797967234
+ size 4664
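The remaining additions (`optimizer.pt`, `rng_state.pth`, `scheduler.pt`, `trainer_state.json`, `training_args.bin`) match the state files the Hugging Face `Trainer` writes alongside a checkpoint. Assuming that layout, `trainer_state.json` (too large to render above) is plain JSON and can be inspected for the logged training progress without loading any weights.

```python
# Sketch: read trainer_state.json from a local download of this repository.
# Assumes the standard Hugging Face Trainer checkpoint format.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

print(state["global_step"], state["epoch"])  # overall progress
for entry in state["log_history"][:5]:       # first few logged entries (loss, lr, ...)
    print(entry)
```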