KotshinZ committed on
Commit
6d3b002
·
verified ·
1 Parent(s): 1c299dc

Model save

Browse files
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: tiiuae/falcon-mamba-7b-instruct
3
+ library_name: transformers
4
+ model_name: mamba-distill-7b
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - sft
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for mamba-distill-7b
13
+
14
+ This model is a fine-tuned version of [tiiuae/falcon-mamba-7b-instruct](https://huggingface.co/tiiuae/falcon-mamba-7b-instruct).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="KotshinZ/mamba-distill-7b", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/shin2021001-osaka-city-university/huggingface/runs/fw25pcxt)
31
+
32
+
33
+ This model was trained with supervised fine-tuning (SFT).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.15.2
38
+ - Transformers: 4.50.1
39
+ - PyTorch: 2.5.1
40
+ - Datasets: 3.4.1
41
+ - Tokenizers: 0.21.1
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 3.616227566899167e+18,
3
+ "train_loss": 16.595592213948567,
4
+ "train_runtime": 29081.677,
5
+ "train_samples": 16610,
6
+ "train_samples_per_second": 0.827,
7
+ "train_steps_per_second": 0.013
8
+ }
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "FalconMambaForCausalLM"
4
+ ],
5
+ "bos_token_id": 8,
6
+ "conv_kernel": 4,
7
+ "eos_token_id": 11,
8
+ "expand": 16,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 4096,
11
+ "initializer_range": 0.1,
12
+ "intermediate_size": 8192,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "mixer_rms_eps": 1e-06,
15
+ "model_type": "falcon_mamba",
16
+ "num_hidden_layers": 64,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": false,
19
+ "residual_in_fp32": true,
20
+ "state_size": 16,
21
+ "tie_word_embeddings": false,
22
+ "time_step_floor": 0.0001,
23
+ "time_step_init_scheme": "random",
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 256,
27
+ "time_step_scale": 1.0,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.50.1",
30
+ "use_bias": false,
31
+ "use_cache": false,
32
+ "use_conv_bias": true,
33
+ "use_mambapy": false,
34
+ "vocab_size": 65024
35
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 8,
4
+ "eos_token_id": [
5
+ 11,
6
+ 10
7
+ ],
8
+ "pad_token_id": 0,
9
+ "transformers_version": "4.50.1"
10
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f39e223c48d308387ded14110ce989bb372db7277cbef5daec20cfa023cc66f
3
+ size 4956184024
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbed912f504bd39d4172e756af4817868189b6eba210d502da3facec29aececa
3
+ size 4987536920
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f1bd09a9aefbd68a9dddd0763450beb3c785ce39c9c29a98a0b3753adc9d5b6
3
+ size 4601680888
model.safetensors.index.json ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 14545330176
4
+ },
5
+ "weight_map": {
6
+ "backbone.embeddings.weight": "model-00001-of-00003.safetensors",
7
+ "backbone.layers.0.mixer.A_log": "model-00001-of-00003.safetensors",
8
+ "backbone.layers.0.mixer.D": "model-00001-of-00003.safetensors",
9
+ "backbone.layers.0.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
10
+ "backbone.layers.0.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
11
+ "backbone.layers.0.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
12
+ "backbone.layers.0.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
13
+ "backbone.layers.0.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
14
+ "backbone.layers.0.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
15
+ "backbone.layers.0.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
16
+ "backbone.layers.0.norm.weight": "model-00001-of-00003.safetensors",
17
+ "backbone.layers.1.mixer.A_log": "model-00001-of-00003.safetensors",
18
+ "backbone.layers.1.mixer.D": "model-00001-of-00003.safetensors",
19
+ "backbone.layers.1.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
20
+ "backbone.layers.1.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
21
+ "backbone.layers.1.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
22
+ "backbone.layers.1.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
23
+ "backbone.layers.1.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
24
+ "backbone.layers.1.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
25
+ "backbone.layers.1.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
26
+ "backbone.layers.1.norm.weight": "model-00001-of-00003.safetensors",
27
+ "backbone.layers.10.mixer.A_log": "model-00001-of-00003.safetensors",
28
+ "backbone.layers.10.mixer.D": "model-00001-of-00003.safetensors",
29
+ "backbone.layers.10.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
30
+ "backbone.layers.10.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
31
+ "backbone.layers.10.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
32
+ "backbone.layers.10.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
33
+ "backbone.layers.10.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
34
+ "backbone.layers.10.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
35
+ "backbone.layers.10.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
36
+ "backbone.layers.10.norm.weight": "model-00001-of-00003.safetensors",
37
+ "backbone.layers.11.mixer.A_log": "model-00001-of-00003.safetensors",
38
+ "backbone.layers.11.mixer.D": "model-00001-of-00003.safetensors",
39
+ "backbone.layers.11.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
40
+ "backbone.layers.11.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
41
+ "backbone.layers.11.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
42
+ "backbone.layers.11.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
43
+ "backbone.layers.11.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
44
+ "backbone.layers.11.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
45
+ "backbone.layers.11.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
46
+ "backbone.layers.11.norm.weight": "model-00001-of-00003.safetensors",
47
+ "backbone.layers.12.mixer.A_log": "model-00001-of-00003.safetensors",
48
+ "backbone.layers.12.mixer.D": "model-00001-of-00003.safetensors",
49
+ "backbone.layers.12.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
50
+ "backbone.layers.12.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
51
+ "backbone.layers.12.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
52
+ "backbone.layers.12.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
53
+ "backbone.layers.12.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
54
+ "backbone.layers.12.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
55
+ "backbone.layers.12.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
56
+ "backbone.layers.12.norm.weight": "model-00001-of-00003.safetensors",
57
+ "backbone.layers.13.mixer.A_log": "model-00001-of-00003.safetensors",
58
+ "backbone.layers.13.mixer.D": "model-00001-of-00003.safetensors",
59
+ "backbone.layers.13.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
60
+ "backbone.layers.13.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
61
+ "backbone.layers.13.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
62
+ "backbone.layers.13.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
63
+ "backbone.layers.13.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
64
+ "backbone.layers.13.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
65
+ "backbone.layers.13.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
66
+ "backbone.layers.13.norm.weight": "model-00001-of-00003.safetensors",
67
+ "backbone.layers.14.mixer.A_log": "model-00001-of-00003.safetensors",
68
+ "backbone.layers.14.mixer.D": "model-00001-of-00003.safetensors",
69
+ "backbone.layers.14.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
70
+ "backbone.layers.14.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
71
+ "backbone.layers.14.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
72
+ "backbone.layers.14.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
73
+ "backbone.layers.14.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
74
+ "backbone.layers.14.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
75
+ "backbone.layers.14.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
76
+ "backbone.layers.14.norm.weight": "model-00001-of-00003.safetensors",
77
+ "backbone.layers.15.mixer.A_log": "model-00001-of-00003.safetensors",
78
+ "backbone.layers.15.mixer.D": "model-00001-of-00003.safetensors",
79
+ "backbone.layers.15.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
80
+ "backbone.layers.15.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
81
+ "backbone.layers.15.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
82
+ "backbone.layers.15.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
83
+ "backbone.layers.15.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
84
+ "backbone.layers.15.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
85
+ "backbone.layers.15.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
86
+ "backbone.layers.15.norm.weight": "model-00001-of-00003.safetensors",
87
+ "backbone.layers.16.mixer.A_log": "model-00001-of-00003.safetensors",
88
+ "backbone.layers.16.mixer.D": "model-00001-of-00003.safetensors",
89
+ "backbone.layers.16.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
90
+ "backbone.layers.16.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
91
+ "backbone.layers.16.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
92
+ "backbone.layers.16.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
93
+ "backbone.layers.16.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
94
+ "backbone.layers.16.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
95
+ "backbone.layers.16.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
96
+ "backbone.layers.16.norm.weight": "model-00001-of-00003.safetensors",
97
+ "backbone.layers.17.mixer.A_log": "model-00001-of-00003.safetensors",
98
+ "backbone.layers.17.mixer.D": "model-00001-of-00003.safetensors",
99
+ "backbone.layers.17.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
100
+ "backbone.layers.17.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
101
+ "backbone.layers.17.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
102
+ "backbone.layers.17.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
103
+ "backbone.layers.17.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
104
+ "backbone.layers.17.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
105
+ "backbone.layers.17.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
106
+ "backbone.layers.17.norm.weight": "model-00001-of-00003.safetensors",
107
+ "backbone.layers.18.mixer.A_log": "model-00001-of-00003.safetensors",
108
+ "backbone.layers.18.mixer.D": "model-00001-of-00003.safetensors",
109
+ "backbone.layers.18.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
110
+ "backbone.layers.18.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
111
+ "backbone.layers.18.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
112
+ "backbone.layers.18.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
113
+ "backbone.layers.18.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
114
+ "backbone.layers.18.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
115
+ "backbone.layers.18.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
116
+ "backbone.layers.18.norm.weight": "model-00001-of-00003.safetensors",
117
+ "backbone.layers.19.mixer.A_log": "model-00001-of-00003.safetensors",
118
+ "backbone.layers.19.mixer.D": "model-00001-of-00003.safetensors",
119
+ "backbone.layers.19.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
120
+ "backbone.layers.19.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
121
+ "backbone.layers.19.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
122
+ "backbone.layers.19.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
123
+ "backbone.layers.19.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
124
+ "backbone.layers.19.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
125
+ "backbone.layers.19.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
126
+ "backbone.layers.19.norm.weight": "model-00001-of-00003.safetensors",
127
+ "backbone.layers.2.mixer.A_log": "model-00001-of-00003.safetensors",
128
+ "backbone.layers.2.mixer.D": "model-00001-of-00003.safetensors",
129
+ "backbone.layers.2.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
130
+ "backbone.layers.2.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
131
+ "backbone.layers.2.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
132
+ "backbone.layers.2.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
133
+ "backbone.layers.2.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
134
+ "backbone.layers.2.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
135
+ "backbone.layers.2.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
136
+ "backbone.layers.2.norm.weight": "model-00001-of-00003.safetensors",
137
+ "backbone.layers.20.mixer.A_log": "model-00001-of-00003.safetensors",
138
+ "backbone.layers.20.mixer.D": "model-00001-of-00003.safetensors",
139
+ "backbone.layers.20.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
140
+ "backbone.layers.20.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
141
+ "backbone.layers.20.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
142
+ "backbone.layers.20.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
143
+ "backbone.layers.20.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
144
+ "backbone.layers.20.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
145
+ "backbone.layers.20.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
146
+ "backbone.layers.20.norm.weight": "model-00001-of-00003.safetensors",
147
+ "backbone.layers.21.mixer.A_log": "model-00001-of-00003.safetensors",
148
+ "backbone.layers.21.mixer.D": "model-00001-of-00003.safetensors",
149
+ "backbone.layers.21.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
150
+ "backbone.layers.21.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
151
+ "backbone.layers.21.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
152
+ "backbone.layers.21.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
153
+ "backbone.layers.21.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
154
+ "backbone.layers.21.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
155
+ "backbone.layers.21.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
156
+ "backbone.layers.21.norm.weight": "model-00001-of-00003.safetensors",
157
+ "backbone.layers.22.mixer.A_log": "model-00002-of-00003.safetensors",
158
+ "backbone.layers.22.mixer.D": "model-00002-of-00003.safetensors",
159
+ "backbone.layers.22.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
160
+ "backbone.layers.22.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
161
+ "backbone.layers.22.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
162
+ "backbone.layers.22.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
163
+ "backbone.layers.22.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
164
+ "backbone.layers.22.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
165
+ "backbone.layers.22.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
166
+ "backbone.layers.22.norm.weight": "model-00002-of-00003.safetensors",
167
+ "backbone.layers.23.mixer.A_log": "model-00002-of-00003.safetensors",
168
+ "backbone.layers.23.mixer.D": "model-00002-of-00003.safetensors",
169
+ "backbone.layers.23.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
170
+ "backbone.layers.23.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
171
+ "backbone.layers.23.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
172
+ "backbone.layers.23.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
173
+ "backbone.layers.23.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
174
+ "backbone.layers.23.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
175
+ "backbone.layers.23.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
176
+ "backbone.layers.23.norm.weight": "model-00002-of-00003.safetensors",
177
+ "backbone.layers.24.mixer.A_log": "model-00002-of-00003.safetensors",
178
+ "backbone.layers.24.mixer.D": "model-00002-of-00003.safetensors",
179
+ "backbone.layers.24.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
180
+ "backbone.layers.24.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
181
+ "backbone.layers.24.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
182
+ "backbone.layers.24.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
183
+ "backbone.layers.24.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
184
+ "backbone.layers.24.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
185
+ "backbone.layers.24.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
186
+ "backbone.layers.24.norm.weight": "model-00002-of-00003.safetensors",
187
+ "backbone.layers.25.mixer.A_log": "model-00002-of-00003.safetensors",
188
+ "backbone.layers.25.mixer.D": "model-00002-of-00003.safetensors",
189
+ "backbone.layers.25.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
190
+ "backbone.layers.25.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
191
+ "backbone.layers.25.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
192
+ "backbone.layers.25.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
193
+ "backbone.layers.25.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
194
+ "backbone.layers.25.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
195
+ "backbone.layers.25.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
196
+ "backbone.layers.25.norm.weight": "model-00002-of-00003.safetensors",
197
+ "backbone.layers.26.mixer.A_log": "model-00002-of-00003.safetensors",
198
+ "backbone.layers.26.mixer.D": "model-00002-of-00003.safetensors",
199
+ "backbone.layers.26.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
200
+ "backbone.layers.26.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
201
+ "backbone.layers.26.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
202
+ "backbone.layers.26.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
203
+ "backbone.layers.26.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
204
+ "backbone.layers.26.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
205
+ "backbone.layers.26.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
206
+ "backbone.layers.26.norm.weight": "model-00002-of-00003.safetensors",
207
+ "backbone.layers.27.mixer.A_log": "model-00002-of-00003.safetensors",
208
+ "backbone.layers.27.mixer.D": "model-00002-of-00003.safetensors",
209
+ "backbone.layers.27.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
210
+ "backbone.layers.27.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
211
+ "backbone.layers.27.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
212
+ "backbone.layers.27.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
213
+ "backbone.layers.27.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
214
+ "backbone.layers.27.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
215
+ "backbone.layers.27.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
216
+ "backbone.layers.27.norm.weight": "model-00002-of-00003.safetensors",
217
+ "backbone.layers.28.mixer.A_log": "model-00002-of-00003.safetensors",
218
+ "backbone.layers.28.mixer.D": "model-00002-of-00003.safetensors",
219
+ "backbone.layers.28.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
220
+ "backbone.layers.28.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
221
+ "backbone.layers.28.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
222
+ "backbone.layers.28.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
223
+ "backbone.layers.28.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
224
+ "backbone.layers.28.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
225
+ "backbone.layers.28.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
226
+ "backbone.layers.28.norm.weight": "model-00002-of-00003.safetensors",
227
+ "backbone.layers.29.mixer.A_log": "model-00002-of-00003.safetensors",
228
+ "backbone.layers.29.mixer.D": "model-00002-of-00003.safetensors",
229
+ "backbone.layers.29.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
230
+ "backbone.layers.29.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
231
+ "backbone.layers.29.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
232
+ "backbone.layers.29.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
233
+ "backbone.layers.29.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
234
+ "backbone.layers.29.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
235
+ "backbone.layers.29.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
236
+ "backbone.layers.29.norm.weight": "model-00002-of-00003.safetensors",
237
+ "backbone.layers.3.mixer.A_log": "model-00001-of-00003.safetensors",
238
+ "backbone.layers.3.mixer.D": "model-00001-of-00003.safetensors",
239
+ "backbone.layers.3.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
240
+ "backbone.layers.3.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
241
+ "backbone.layers.3.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
242
+ "backbone.layers.3.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
243
+ "backbone.layers.3.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
244
+ "backbone.layers.3.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
245
+ "backbone.layers.3.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
246
+ "backbone.layers.3.norm.weight": "model-00001-of-00003.safetensors",
247
+ "backbone.layers.30.mixer.A_log": "model-00002-of-00003.safetensors",
248
+ "backbone.layers.30.mixer.D": "model-00002-of-00003.safetensors",
249
+ "backbone.layers.30.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
250
+ "backbone.layers.30.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
251
+ "backbone.layers.30.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
252
+ "backbone.layers.30.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
253
+ "backbone.layers.30.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
254
+ "backbone.layers.30.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
255
+ "backbone.layers.30.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
256
+ "backbone.layers.30.norm.weight": "model-00002-of-00003.safetensors",
257
+ "backbone.layers.31.mixer.A_log": "model-00002-of-00003.safetensors",
258
+ "backbone.layers.31.mixer.D": "model-00002-of-00003.safetensors",
259
+ "backbone.layers.31.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
260
+ "backbone.layers.31.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
261
+ "backbone.layers.31.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
262
+ "backbone.layers.31.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
263
+ "backbone.layers.31.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
264
+ "backbone.layers.31.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
265
+ "backbone.layers.31.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
266
+ "backbone.layers.31.norm.weight": "model-00002-of-00003.safetensors",
267
+ "backbone.layers.32.mixer.A_log": "model-00002-of-00003.safetensors",
268
+ "backbone.layers.32.mixer.D": "model-00002-of-00003.safetensors",
269
+ "backbone.layers.32.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
270
+ "backbone.layers.32.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
271
+ "backbone.layers.32.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
272
+ "backbone.layers.32.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
273
+ "backbone.layers.32.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
274
+ "backbone.layers.32.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
275
+ "backbone.layers.32.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
276
+ "backbone.layers.32.norm.weight": "model-00002-of-00003.safetensors",
277
+ "backbone.layers.33.mixer.A_log": "model-00002-of-00003.safetensors",
278
+ "backbone.layers.33.mixer.D": "model-00002-of-00003.safetensors",
279
+ "backbone.layers.33.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
280
+ "backbone.layers.33.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
281
+ "backbone.layers.33.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
282
+ "backbone.layers.33.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
283
+ "backbone.layers.33.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
284
+ "backbone.layers.33.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
285
+ "backbone.layers.33.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
286
+ "backbone.layers.33.norm.weight": "model-00002-of-00003.safetensors",
287
+ "backbone.layers.34.mixer.A_log": "model-00002-of-00003.safetensors",
288
+ "backbone.layers.34.mixer.D": "model-00002-of-00003.safetensors",
289
+ "backbone.layers.34.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
290
+ "backbone.layers.34.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
291
+ "backbone.layers.34.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
292
+ "backbone.layers.34.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
293
+ "backbone.layers.34.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
294
+ "backbone.layers.34.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
295
+ "backbone.layers.34.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
296
+ "backbone.layers.34.norm.weight": "model-00002-of-00003.safetensors",
297
+ "backbone.layers.35.mixer.A_log": "model-00002-of-00003.safetensors",
298
+ "backbone.layers.35.mixer.D": "model-00002-of-00003.safetensors",
299
+ "backbone.layers.35.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
300
+ "backbone.layers.35.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
301
+ "backbone.layers.35.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
302
+ "backbone.layers.35.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
303
+ "backbone.layers.35.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
304
+ "backbone.layers.35.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
305
+ "backbone.layers.35.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
306
+ "backbone.layers.35.norm.weight": "model-00002-of-00003.safetensors",
307
+ "backbone.layers.36.mixer.A_log": "model-00002-of-00003.safetensors",
308
+ "backbone.layers.36.mixer.D": "model-00002-of-00003.safetensors",
309
+ "backbone.layers.36.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
310
+ "backbone.layers.36.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
311
+ "backbone.layers.36.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
312
+ "backbone.layers.36.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
313
+ "backbone.layers.36.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
314
+ "backbone.layers.36.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
315
+ "backbone.layers.36.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
316
+ "backbone.layers.36.norm.weight": "model-00002-of-00003.safetensors",
317
+ "backbone.layers.37.mixer.A_log": "model-00002-of-00003.safetensors",
318
+ "backbone.layers.37.mixer.D": "model-00002-of-00003.safetensors",
319
+ "backbone.layers.37.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
320
+ "backbone.layers.37.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
321
+ "backbone.layers.37.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
322
+ "backbone.layers.37.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
323
+ "backbone.layers.37.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
324
+ "backbone.layers.37.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
325
+ "backbone.layers.37.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
326
+ "backbone.layers.37.norm.weight": "model-00002-of-00003.safetensors",
327
+ "backbone.layers.38.mixer.A_log": "model-00002-of-00003.safetensors",
328
+ "backbone.layers.38.mixer.D": "model-00002-of-00003.safetensors",
329
+ "backbone.layers.38.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
330
+ "backbone.layers.38.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
331
+ "backbone.layers.38.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
332
+ "backbone.layers.38.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
333
+ "backbone.layers.38.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
334
+ "backbone.layers.38.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
335
+ "backbone.layers.38.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
336
+ "backbone.layers.38.norm.weight": "model-00002-of-00003.safetensors",
337
+ "backbone.layers.39.mixer.A_log": "model-00002-of-00003.safetensors",
338
+ "backbone.layers.39.mixer.D": "model-00002-of-00003.safetensors",
339
+ "backbone.layers.39.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
340
+ "backbone.layers.39.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
341
+ "backbone.layers.39.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
342
+ "backbone.layers.39.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
343
+ "backbone.layers.39.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
344
+ "backbone.layers.39.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
345
+ "backbone.layers.39.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
346
+ "backbone.layers.39.norm.weight": "model-00002-of-00003.safetensors",
347
+ "backbone.layers.4.mixer.A_log": "model-00001-of-00003.safetensors",
348
+ "backbone.layers.4.mixer.D": "model-00001-of-00003.safetensors",
349
+ "backbone.layers.4.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
350
+ "backbone.layers.4.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
351
+ "backbone.layers.4.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
352
+ "backbone.layers.4.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
353
+ "backbone.layers.4.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
354
+ "backbone.layers.4.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
355
+ "backbone.layers.4.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
356
+ "backbone.layers.4.norm.weight": "model-00001-of-00003.safetensors",
357
+ "backbone.layers.40.mixer.A_log": "model-00002-of-00003.safetensors",
358
+ "backbone.layers.40.mixer.D": "model-00002-of-00003.safetensors",
359
+ "backbone.layers.40.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
360
+ "backbone.layers.40.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
361
+ "backbone.layers.40.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
362
+ "backbone.layers.40.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
363
+ "backbone.layers.40.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
364
+ "backbone.layers.40.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
365
+ "backbone.layers.40.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
366
+ "backbone.layers.40.norm.weight": "model-00002-of-00003.safetensors",
367
+ "backbone.layers.41.mixer.A_log": "model-00002-of-00003.safetensors",
368
+ "backbone.layers.41.mixer.D": "model-00002-of-00003.safetensors",
369
+ "backbone.layers.41.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
370
+ "backbone.layers.41.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
371
+ "backbone.layers.41.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
372
+ "backbone.layers.41.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
373
+ "backbone.layers.41.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
374
+ "backbone.layers.41.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
375
+ "backbone.layers.41.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
376
+ "backbone.layers.41.norm.weight": "model-00002-of-00003.safetensors",
377
+ "backbone.layers.42.mixer.A_log": "model-00002-of-00003.safetensors",
378
+ "backbone.layers.42.mixer.D": "model-00002-of-00003.safetensors",
379
+ "backbone.layers.42.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
380
+ "backbone.layers.42.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
381
+ "backbone.layers.42.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
382
+ "backbone.layers.42.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
383
+ "backbone.layers.42.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
384
+ "backbone.layers.42.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
385
+ "backbone.layers.42.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
386
+ "backbone.layers.42.norm.weight": "model-00002-of-00003.safetensors",
387
+ "backbone.layers.43.mixer.A_log": "model-00002-of-00003.safetensors",
388
+ "backbone.layers.43.mixer.D": "model-00002-of-00003.safetensors",
389
+ "backbone.layers.43.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
390
+ "backbone.layers.43.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
391
+ "backbone.layers.43.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
392
+ "backbone.layers.43.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
393
+ "backbone.layers.43.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
394
+ "backbone.layers.43.mixer.out_proj.weight": "model-00002-of-00003.safetensors",
395
+ "backbone.layers.43.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
396
+ "backbone.layers.43.norm.weight": "model-00002-of-00003.safetensors",
397
+ "backbone.layers.44.mixer.A_log": "model-00002-of-00003.safetensors",
398
+ "backbone.layers.44.mixer.D": "model-00002-of-00003.safetensors",
399
+ "backbone.layers.44.mixer.conv1d.bias": "model-00002-of-00003.safetensors",
400
+ "backbone.layers.44.mixer.conv1d.weight": "model-00002-of-00003.safetensors",
401
+ "backbone.layers.44.mixer.dt_proj.bias": "model-00002-of-00003.safetensors",
402
+ "backbone.layers.44.mixer.dt_proj.weight": "model-00002-of-00003.safetensors",
403
+ "backbone.layers.44.mixer.in_proj.weight": "model-00002-of-00003.safetensors",
404
+ "backbone.layers.44.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
405
+ "backbone.layers.44.mixer.x_proj.weight": "model-00002-of-00003.safetensors",
406
+ "backbone.layers.44.norm.weight": "model-00002-of-00003.safetensors",
407
+ "backbone.layers.45.mixer.A_log": "model-00003-of-00003.safetensors",
408
+ "backbone.layers.45.mixer.D": "model-00003-of-00003.safetensors",
409
+ "backbone.layers.45.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
410
+ "backbone.layers.45.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
411
+ "backbone.layers.45.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
412
+ "backbone.layers.45.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
413
+ "backbone.layers.45.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
414
+ "backbone.layers.45.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
415
+ "backbone.layers.45.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
416
+ "backbone.layers.45.norm.weight": "model-00003-of-00003.safetensors",
417
+ "backbone.layers.46.mixer.A_log": "model-00003-of-00003.safetensors",
418
+ "backbone.layers.46.mixer.D": "model-00003-of-00003.safetensors",
419
+ "backbone.layers.46.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
420
+ "backbone.layers.46.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
421
+ "backbone.layers.46.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
422
+ "backbone.layers.46.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
423
+ "backbone.layers.46.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
424
+ "backbone.layers.46.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
425
+ "backbone.layers.46.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
426
+ "backbone.layers.46.norm.weight": "model-00003-of-00003.safetensors",
427
+ "backbone.layers.47.mixer.A_log": "model-00003-of-00003.safetensors",
428
+ "backbone.layers.47.mixer.D": "model-00003-of-00003.safetensors",
429
+ "backbone.layers.47.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
430
+ "backbone.layers.47.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
431
+ "backbone.layers.47.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
432
+ "backbone.layers.47.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
433
+ "backbone.layers.47.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
434
+ "backbone.layers.47.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
435
+ "backbone.layers.47.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
436
+ "backbone.layers.47.norm.weight": "model-00003-of-00003.safetensors",
437
+ "backbone.layers.48.mixer.A_log": "model-00003-of-00003.safetensors",
438
+ "backbone.layers.48.mixer.D": "model-00003-of-00003.safetensors",
439
+ "backbone.layers.48.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
440
+ "backbone.layers.48.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
441
+ "backbone.layers.48.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
442
+ "backbone.layers.48.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
443
+ "backbone.layers.48.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
444
+ "backbone.layers.48.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
445
+ "backbone.layers.48.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
446
+ "backbone.layers.48.norm.weight": "model-00003-of-00003.safetensors",
447
+ "backbone.layers.49.mixer.A_log": "model-00003-of-00003.safetensors",
448
+ "backbone.layers.49.mixer.D": "model-00003-of-00003.safetensors",
449
+ "backbone.layers.49.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
450
+ "backbone.layers.49.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
451
+ "backbone.layers.49.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
452
+ "backbone.layers.49.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
453
+ "backbone.layers.49.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
454
+ "backbone.layers.49.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
455
+ "backbone.layers.49.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
456
+ "backbone.layers.49.norm.weight": "model-00003-of-00003.safetensors",
457
+ "backbone.layers.5.mixer.A_log": "model-00001-of-00003.safetensors",
458
+ "backbone.layers.5.mixer.D": "model-00001-of-00003.safetensors",
459
+ "backbone.layers.5.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
460
+ "backbone.layers.5.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
461
+ "backbone.layers.5.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
462
+ "backbone.layers.5.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
463
+ "backbone.layers.5.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
464
+ "backbone.layers.5.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
465
+ "backbone.layers.5.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
466
+ "backbone.layers.5.norm.weight": "model-00001-of-00003.safetensors",
467
+ "backbone.layers.50.mixer.A_log": "model-00003-of-00003.safetensors",
468
+ "backbone.layers.50.mixer.D": "model-00003-of-00003.safetensors",
469
+ "backbone.layers.50.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
470
+ "backbone.layers.50.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
471
+ "backbone.layers.50.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
472
+ "backbone.layers.50.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
473
+ "backbone.layers.50.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
474
+ "backbone.layers.50.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
475
+ "backbone.layers.50.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
476
+ "backbone.layers.50.norm.weight": "model-00003-of-00003.safetensors",
477
+ "backbone.layers.51.mixer.A_log": "model-00003-of-00003.safetensors",
478
+ "backbone.layers.51.mixer.D": "model-00003-of-00003.safetensors",
479
+ "backbone.layers.51.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
480
+ "backbone.layers.51.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
481
+ "backbone.layers.51.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
482
+ "backbone.layers.51.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
483
+ "backbone.layers.51.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
484
+ "backbone.layers.51.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
485
+ "backbone.layers.51.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
486
+ "backbone.layers.51.norm.weight": "model-00003-of-00003.safetensors",
487
+ "backbone.layers.52.mixer.A_log": "model-00003-of-00003.safetensors",
488
+ "backbone.layers.52.mixer.D": "model-00003-of-00003.safetensors",
489
+ "backbone.layers.52.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
490
+ "backbone.layers.52.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
491
+ "backbone.layers.52.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
492
+ "backbone.layers.52.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
493
+ "backbone.layers.52.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
494
+ "backbone.layers.52.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
495
+ "backbone.layers.52.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
496
+ "backbone.layers.52.norm.weight": "model-00003-of-00003.safetensors",
497
+ "backbone.layers.53.mixer.A_log": "model-00003-of-00003.safetensors",
498
+ "backbone.layers.53.mixer.D": "model-00003-of-00003.safetensors",
499
+ "backbone.layers.53.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
500
+ "backbone.layers.53.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
501
+ "backbone.layers.53.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
502
+ "backbone.layers.53.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
503
+ "backbone.layers.53.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
504
+ "backbone.layers.53.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
505
+ "backbone.layers.53.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
506
+ "backbone.layers.53.norm.weight": "model-00003-of-00003.safetensors",
507
+ "backbone.layers.54.mixer.A_log": "model-00003-of-00003.safetensors",
508
+ "backbone.layers.54.mixer.D": "model-00003-of-00003.safetensors",
509
+ "backbone.layers.54.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
510
+ "backbone.layers.54.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
511
+ "backbone.layers.54.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
512
+ "backbone.layers.54.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
513
+ "backbone.layers.54.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
514
+ "backbone.layers.54.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
515
+ "backbone.layers.54.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
516
+ "backbone.layers.54.norm.weight": "model-00003-of-00003.safetensors",
517
+ "backbone.layers.55.mixer.A_log": "model-00003-of-00003.safetensors",
518
+ "backbone.layers.55.mixer.D": "model-00003-of-00003.safetensors",
519
+ "backbone.layers.55.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
520
+ "backbone.layers.55.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
521
+ "backbone.layers.55.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
522
+ "backbone.layers.55.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
523
+ "backbone.layers.55.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
524
+ "backbone.layers.55.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
525
+ "backbone.layers.55.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
526
+ "backbone.layers.55.norm.weight": "model-00003-of-00003.safetensors",
527
+ "backbone.layers.56.mixer.A_log": "model-00003-of-00003.safetensors",
528
+ "backbone.layers.56.mixer.D": "model-00003-of-00003.safetensors",
529
+ "backbone.layers.56.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
530
+ "backbone.layers.56.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
531
+ "backbone.layers.56.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
532
+ "backbone.layers.56.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
533
+ "backbone.layers.56.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
534
+ "backbone.layers.56.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
535
+ "backbone.layers.56.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
536
+ "backbone.layers.56.norm.weight": "model-00003-of-00003.safetensors",
537
+ "backbone.layers.57.mixer.A_log": "model-00003-of-00003.safetensors",
538
+ "backbone.layers.57.mixer.D": "model-00003-of-00003.safetensors",
539
+ "backbone.layers.57.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
540
+ "backbone.layers.57.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
541
+ "backbone.layers.57.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
542
+ "backbone.layers.57.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
543
+ "backbone.layers.57.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
544
+ "backbone.layers.57.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
545
+ "backbone.layers.57.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
546
+ "backbone.layers.57.norm.weight": "model-00003-of-00003.safetensors",
547
+ "backbone.layers.58.mixer.A_log": "model-00003-of-00003.safetensors",
548
+ "backbone.layers.58.mixer.D": "model-00003-of-00003.safetensors",
549
+ "backbone.layers.58.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
550
+ "backbone.layers.58.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
551
+ "backbone.layers.58.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
552
+ "backbone.layers.58.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
553
+ "backbone.layers.58.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
554
+ "backbone.layers.58.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
555
+ "backbone.layers.58.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
556
+ "backbone.layers.58.norm.weight": "model-00003-of-00003.safetensors",
557
+ "backbone.layers.59.mixer.A_log": "model-00003-of-00003.safetensors",
558
+ "backbone.layers.59.mixer.D": "model-00003-of-00003.safetensors",
559
+ "backbone.layers.59.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
560
+ "backbone.layers.59.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
561
+ "backbone.layers.59.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
562
+ "backbone.layers.59.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
563
+ "backbone.layers.59.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
564
+ "backbone.layers.59.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
565
+ "backbone.layers.59.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
566
+ "backbone.layers.59.norm.weight": "model-00003-of-00003.safetensors",
567
+ "backbone.layers.6.mixer.A_log": "model-00001-of-00003.safetensors",
568
+ "backbone.layers.6.mixer.D": "model-00001-of-00003.safetensors",
569
+ "backbone.layers.6.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
570
+ "backbone.layers.6.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
571
+ "backbone.layers.6.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
572
+ "backbone.layers.6.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
573
+ "backbone.layers.6.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
574
+ "backbone.layers.6.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
575
+ "backbone.layers.6.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
576
+ "backbone.layers.6.norm.weight": "model-00001-of-00003.safetensors",
577
+ "backbone.layers.60.mixer.A_log": "model-00003-of-00003.safetensors",
578
+ "backbone.layers.60.mixer.D": "model-00003-of-00003.safetensors",
579
+ "backbone.layers.60.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
580
+ "backbone.layers.60.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
581
+ "backbone.layers.60.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
582
+ "backbone.layers.60.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
583
+ "backbone.layers.60.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
584
+ "backbone.layers.60.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
585
+ "backbone.layers.60.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
586
+ "backbone.layers.60.norm.weight": "model-00003-of-00003.safetensors",
587
+ "backbone.layers.61.mixer.A_log": "model-00003-of-00003.safetensors",
588
+ "backbone.layers.61.mixer.D": "model-00003-of-00003.safetensors",
589
+ "backbone.layers.61.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
590
+ "backbone.layers.61.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
591
+ "backbone.layers.61.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
592
+ "backbone.layers.61.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
593
+ "backbone.layers.61.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
594
+ "backbone.layers.61.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
595
+ "backbone.layers.61.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
596
+ "backbone.layers.61.norm.weight": "model-00003-of-00003.safetensors",
597
+ "backbone.layers.62.mixer.A_log": "model-00003-of-00003.safetensors",
598
+ "backbone.layers.62.mixer.D": "model-00003-of-00003.safetensors",
599
+ "backbone.layers.62.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
600
+ "backbone.layers.62.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
601
+ "backbone.layers.62.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
602
+ "backbone.layers.62.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
603
+ "backbone.layers.62.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
604
+ "backbone.layers.62.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
605
+ "backbone.layers.62.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
606
+ "backbone.layers.62.norm.weight": "model-00003-of-00003.safetensors",
607
+ "backbone.layers.63.mixer.A_log": "model-00003-of-00003.safetensors",
608
+ "backbone.layers.63.mixer.D": "model-00003-of-00003.safetensors",
609
+ "backbone.layers.63.mixer.conv1d.bias": "model-00003-of-00003.safetensors",
610
+ "backbone.layers.63.mixer.conv1d.weight": "model-00003-of-00003.safetensors",
611
+ "backbone.layers.63.mixer.dt_proj.bias": "model-00003-of-00003.safetensors",
612
+ "backbone.layers.63.mixer.dt_proj.weight": "model-00003-of-00003.safetensors",
613
+ "backbone.layers.63.mixer.in_proj.weight": "model-00003-of-00003.safetensors",
614
+ "backbone.layers.63.mixer.out_proj.weight": "model-00003-of-00003.safetensors",
615
+ "backbone.layers.63.mixer.x_proj.weight": "model-00003-of-00003.safetensors",
616
+ "backbone.layers.63.norm.weight": "model-00003-of-00003.safetensors",
617
+ "backbone.layers.7.mixer.A_log": "model-00001-of-00003.safetensors",
618
+ "backbone.layers.7.mixer.D": "model-00001-of-00003.safetensors",
619
+ "backbone.layers.7.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
620
+ "backbone.layers.7.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
621
+ "backbone.layers.7.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
622
+ "backbone.layers.7.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
623
+ "backbone.layers.7.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
624
+ "backbone.layers.7.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
625
+ "backbone.layers.7.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
626
+ "backbone.layers.7.norm.weight": "model-00001-of-00003.safetensors",
627
+ "backbone.layers.8.mixer.A_log": "model-00001-of-00003.safetensors",
628
+ "backbone.layers.8.mixer.D": "model-00001-of-00003.safetensors",
629
+ "backbone.layers.8.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
630
+ "backbone.layers.8.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
631
+ "backbone.layers.8.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
632
+ "backbone.layers.8.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
633
+ "backbone.layers.8.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
634
+ "backbone.layers.8.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
635
+ "backbone.layers.8.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
636
+ "backbone.layers.8.norm.weight": "model-00001-of-00003.safetensors",
637
+ "backbone.layers.9.mixer.A_log": "model-00001-of-00003.safetensors",
638
+ "backbone.layers.9.mixer.D": "model-00001-of-00003.safetensors",
639
+ "backbone.layers.9.mixer.conv1d.bias": "model-00001-of-00003.safetensors",
640
+ "backbone.layers.9.mixer.conv1d.weight": "model-00001-of-00003.safetensors",
641
+ "backbone.layers.9.mixer.dt_proj.bias": "model-00001-of-00003.safetensors",
642
+ "backbone.layers.9.mixer.dt_proj.weight": "model-00001-of-00003.safetensors",
643
+ "backbone.layers.9.mixer.in_proj.weight": "model-00001-of-00003.safetensors",
644
+ "backbone.layers.9.mixer.out_proj.weight": "model-00001-of-00003.safetensors",
645
+ "backbone.layers.9.mixer.x_proj.weight": "model-00001-of-00003.safetensors",
646
+ "backbone.layers.9.norm.weight": "model-00001-of-00003.safetensors",
647
+ "backbone.norm_f.weight": "model-00003-of-00003.safetensors",
648
+ "lm_head.weight": "model-00003-of-00003.safetensors"
649
+ }
650
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ ">>TITLE<<",
4
+ ">>ABSTRACT<<",
5
+ ">>INTRODUCTION<<",
6
+ ">>SUMMARY<<",
7
+ ">>COMMENT<<",
8
+ ">>ANSWER<<",
9
+ ">>QUESTION<<",
10
+ "assistant",
11
+ "<|begin_of_text|>",
12
+ "<|im_start|>",
13
+ "<|im_end|>"
14
+ ],
15
+ "bos_token": {
16
+ "content": "<|begin_of_text|>",
17
+ "lstrip": false,
18
+ "normalized": false,
19
+ "rstrip": false,
20
+ "single_word": false
21
+ },
22
+ "eos_token": {
23
+ "content": "<|end_of_text|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false
28
+ },
29
+ "pad_token": "<|end_of_text|>"
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": ">>TITLE<<",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": ">>ABSTRACT<<",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": ">>INTRODUCTION<<",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": ">>SUMMARY<<",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": ">>COMMENT<<",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": ">>ANSWER<<",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": ">>QUESTION<<",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "assistant",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<|begin_of_text|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<|im_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<|im_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<|end_of_text|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ }
100
+ },
101
+ "additional_special_tokens": [
102
+ ">>TITLE<<",
103
+ ">>ABSTRACT<<",
104
+ ">>INTRODUCTION<<",
105
+ ">>SUMMARY<<",
106
+ ">>COMMENT<<",
107
+ ">>ANSWER<<",
108
+ ">>QUESTION<<",
109
+ "assistant",
110
+ "<|begin_of_text|>",
111
+ "<|im_start|>",
112
+ "<|im_end|>"
113
+ ],
114
+ "bos_token": "<|begin_of_text|>",
115
+ "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
116
+ "clean_up_tokenization_spaces": true,
117
+ "eos_token": "<|end_of_text|>",
118
+ "extra_special_tokens": {},
119
+ "max_length": null,
120
+ "model_input_names": [
121
+ "input_ids",
122
+ "attention_mask"
123
+ ],
124
+ "model_max_length": 1000000000000000019884624838656,
125
+ "pad_to_multiple_of": null,
126
+ "pad_token": "<|end_of_text|>",
127
+ "pad_token_type_id": 0,
128
+ "padding_side": "left",
129
+ "tokenizer_class": "PreTrainedTokenizer"
130
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 3.616227566899167e+18,
3
+ "train_loss": 16.595592213948567,
4
+ "train_runtime": 29081.677,
5
+ "train_samples": 16610,
6
+ "train_samples_per_second": 0.827,
7
+ "train_steps_per_second": 0.013
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,706 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.998003992015968,
6
+ "eval_steps": 50,
7
+ "global_step": 375,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.01330671989354624,
14
+ "grad_norm": 38.0893895691,
15
+ "learning_rate": 2.631578947368421e-06,
16
+ "loss": 9.4719,
17
+ "mean_token_accuracy": 0.6992475613951683,
18
+ "step": 5
19
+ },
20
+ {
21
+ "epoch": 0.02661343978709248,
22
+ "grad_norm": 53.38062892110183,
23
+ "learning_rate": 5.263157894736842e-06,
24
+ "loss": 9.2617,
25
+ "mean_token_accuracy": 0.7013110458850861,
26
+ "step": 10
27
+ },
28
+ {
29
+ "epoch": 0.03992015968063872,
30
+ "grad_norm": 65.59379133263475,
31
+ "learning_rate": 7.894736842105265e-06,
32
+ "loss": 8.8052,
33
+ "mean_token_accuracy": 0.7065529704093934,
34
+ "step": 15
35
+ },
36
+ {
37
+ "epoch": 0.05322687957418496,
38
+ "grad_norm": 30.823233386013126,
39
+ "learning_rate": 1.0526315789473684e-05,
40
+ "loss": 8.1585,
41
+ "mean_token_accuracy": 0.719629879295826,
42
+ "step": 20
43
+ },
44
+ {
45
+ "epoch": 0.0665335994677312,
46
+ "grad_norm": 12.66541979902211,
47
+ "learning_rate": 1.3157894736842108e-05,
48
+ "loss": 7.3748,
49
+ "mean_token_accuracy": 0.740014499425888,
50
+ "step": 25
51
+ },
52
+ {
53
+ "epoch": 0.07984031936127745,
54
+ "grad_norm": 10.709230045785015,
55
+ "learning_rate": 1.578947368421053e-05,
56
+ "loss": 7.1481,
57
+ "mean_token_accuracy": 0.7436378166079521,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.09314703925482369,
62
+ "grad_norm": 7.432960167563066,
63
+ "learning_rate": 1.8421052631578947e-05,
64
+ "loss": 6.9424,
65
+ "mean_token_accuracy": 0.7486263796687126,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 0.10645375914836992,
70
+ "grad_norm": 10.045989605681791,
71
+ "learning_rate": 1.9998261969639324e-05,
72
+ "loss": 6.9028,
73
+ "mean_token_accuracy": 0.7475503668189049,
74
+ "step": 40
75
+ },
76
+ {
77
+ "epoch": 0.11976047904191617,
78
+ "grad_norm": 15.965640756107303,
79
+ "learning_rate": 1.9978716065702566e-05,
80
+ "loss": 7.2718,
81
+ "mean_token_accuracy": 0.7349641278386116,
82
+ "step": 45
83
+ },
84
+ {
85
+ "epoch": 0.1330671989354624,
86
+ "grad_norm": 27.77458817629002,
87
+ "learning_rate": 1.9937494319239112e-05,
88
+ "loss": 7.3623,
89
+ "mean_token_accuracy": 0.7310615047812462,
90
+ "step": 50
91
+ },
92
+ {
93
+ "epoch": 0.1330671989354624,
94
+ "eval_loss": 1.008537769317627,
95
+ "eval_mean_token_accuracy": 0.7182760106192695,
96
+ "eval_runtime": 42.3882,
97
+ "eval_samples_per_second": 3.397,
98
+ "eval_steps_per_second": 0.425,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.14637391882900866,
103
+ "grad_norm": 30.81470758273484,
104
+ "learning_rate": 1.9874686272438467e-05,
105
+ "loss": 7.2943,
106
+ "mean_token_accuracy": 0.7337011635303498,
107
+ "step": 55
108
+ },
109
+ {
110
+ "epoch": 0.1596806387225549,
111
+ "grad_norm": 22.17371294352614,
112
+ "learning_rate": 1.979042835741503e-05,
113
+ "loss": 7.104,
114
+ "mean_token_accuracy": 0.7383959114551544,
115
+ "step": 60
116
+ },
117
+ {
118
+ "epoch": 0.17298735861610112,
119
+ "grad_norm": 27.90713548690936,
120
+ "learning_rate": 1.968490359984923e-05,
121
+ "loss": 7.1833,
122
+ "mean_token_accuracy": 0.7363018915057182,
123
+ "step": 65
124
+ },
125
+ {
126
+ "epoch": 0.18629407850964738,
127
+ "grad_norm": 9.001486401205879,
128
+ "learning_rate": 1.9558341221417744e-05,
129
+ "loss": 7.05,
130
+ "mean_token_accuracy": 0.740586844086647,
131
+ "step": 70
132
+ },
133
+ {
134
+ "epoch": 0.1996007984031936,
135
+ "grad_norm": 13.898951307519155,
136
+ "learning_rate": 1.9411016141876438e-05,
137
+ "loss": 7.0786,
138
+ "mean_token_accuracy": 0.7393299728631973,
139
+ "step": 75
140
+ },
141
+ {
142
+ "epoch": 0.21290751829673984,
143
+ "grad_norm": 44.17413162889863,
144
+ "learning_rate": 1.9243248381877605e-05,
145
+ "loss": 7.513,
146
+ "mean_token_accuracy": 0.7232646465301513,
147
+ "step": 80
148
+ },
149
+ {
150
+ "epoch": 0.2262142381902861,
151
+ "grad_norm": 43.276281867208816,
152
+ "learning_rate": 1.9055402367818673e-05,
153
+ "loss": 7.2214,
154
+ "mean_token_accuracy": 0.7344184964895248,
155
+ "step": 85
156
+ },
157
+ {
158
+ "epoch": 0.23952095808383234,
159
+ "grad_norm": 22.007621479836395,
160
+ "learning_rate": 1.8847886140232438e-05,
161
+ "loss": 7.1625,
162
+ "mean_token_accuracy": 0.735144229233265,
163
+ "step": 90
164
+ },
165
+ {
166
+ "epoch": 0.2528276779773786,
167
+ "grad_norm": 77.4740488466291,
168
+ "learning_rate": 1.862115046743831e-05,
169
+ "loss": 7.5932,
170
+ "mean_token_accuracy": 0.722845695912838,
171
+ "step": 95
172
+ },
173
+ {
174
+ "epoch": 0.2661343978709248,
175
+ "grad_norm": 1066.7215003063964,
176
+ "learning_rate": 1.8375687866379988e-05,
177
+ "loss": 7.4423,
178
+ "mean_token_accuracy": 0.7269001781940461,
179
+ "step": 100
180
+ },
181
+ {
182
+ "epoch": 0.2661343978709248,
183
+ "eval_loss": 1.0586838722229004,
184
+ "eval_mean_token_accuracy": 0.6958943770991431,
185
+ "eval_runtime": 42.4569,
186
+ "eval_samples_per_second": 3.392,
187
+ "eval_steps_per_second": 0.424,
188
+ "step": 100
189
+ },
190
+ {
191
+ "epoch": 0.27944111776447106,
192
+ "grad_norm": 923.4336165050305,
193
+ "learning_rate": 1.811203153277641e-05,
194
+ "loss": 8.4501,
195
+ "mean_token_accuracy": 0.6929014056921006,
196
+ "step": 105
197
+ },
198
+ {
199
+ "epoch": 0.2927478376580173,
200
+ "grad_norm": 10253.700005495137,
201
+ "learning_rate": 1.7830754182909985e-05,
202
+ "loss": 11.581,
203
+ "mean_token_accuracy": 0.6142643451690674,
204
+ "step": 110
205
+ },
206
+ {
207
+ "epoch": 0.3060545575515635,
208
+ "grad_norm": 11508.216656593022,
209
+ "learning_rate": 1.753246680956795e-05,
210
+ "loss": 15.7105,
211
+ "mean_token_accuracy": 0.5149824447929859,
212
+ "step": 115
213
+ },
214
+ {
215
+ "epoch": 0.3193612774451098,
216
+ "grad_norm": 6036.292381336853,
217
+ "learning_rate": 1.721781735483921e-05,
218
+ "loss": 26.2876,
219
+ "mean_token_accuracy": 0.33059981614351275,
220
+ "step": 120
221
+ },
222
+ {
223
+ "epoch": 0.33266799733865604,
224
+ "grad_norm": 26623.625140159445,
225
+ "learning_rate": 1.6887489302649657e-05,
226
+ "loss": 30.2414,
227
+ "mean_token_accuracy": 0.26836080476641655,
228
+ "step": 125
229
+ },
230
+ {
231
+ "epoch": 0.34597471723220224,
232
+ "grad_norm": 213666.71326672958,
233
+ "learning_rate": 1.654220019409317e-05,
234
+ "loss": 36.7917,
235
+ "mean_token_accuracy": 0.20126449912786484,
236
+ "step": 130
237
+ },
238
+ {
239
+ "epoch": 0.3592814371257485,
240
+ "grad_norm": 281204.55298403726,
241
+ "learning_rate": 1.6182700068783463e-05,
242
+ "loss": 53.894,
243
+ "mean_token_accuracy": 0.08266483591869474,
244
+ "step": 135
245
+ },
246
+ {
247
+ "epoch": 0.37258815701929476,
248
+ "grad_norm": 274791.48096197617,
249
+ "learning_rate": 1.580976983561235e-05,
250
+ "loss": 58.3125,
251
+ "mean_token_accuracy": 0.06163843311369419,
252
+ "step": 140
253
+ },
254
+ {
255
+ "epoch": 0.38589487691284097,
256
+ "grad_norm": 9564.75792140504,
257
+ "learning_rate": 1.5424219576453526e-05,
258
+ "loss": 45.3478,
259
+ "mean_token_accuracy": 0.12734813932329417,
260
+ "step": 145
261
+ },
262
+ {
263
+ "epoch": 0.3992015968063872,
264
+ "grad_norm": 6120.107227530501,
265
+ "learning_rate": 1.5026886786496624e-05,
266
+ "loss": 42.2261,
267
+ "mean_token_accuracy": 0.1591544572263956,
268
+ "step": 150
269
+ },
270
+ {
271
+ "epoch": 0.3992015968063872,
272
+ "eval_loss": 4.92734432220459,
273
+ "eval_mean_token_accuracy": 0.1774691359864341,
274
+ "eval_runtime": 42.2395,
275
+ "eval_samples_per_second": 3.409,
276
+ "eval_steps_per_second": 0.426,
277
+ "step": 150
278
+ },
279
+ {
280
+ "epoch": 0.4125083166999335,
281
+ "grad_norm": 13441.595281708549,
282
+ "learning_rate": 1.46186345550338e-05,
283
+ "loss": 32.8561,
284
+ "mean_token_accuracy": 0.23304792679846287,
285
+ "step": 155
286
+ },
287
+ {
288
+ "epoch": 0.4258150365934797,
289
+ "grad_norm": 4366.951799070855,
290
+ "learning_rate": 1.4200349690650654e-05,
291
+ "loss": 26.1181,
292
+ "mean_token_accuracy": 0.3177220694720745,
293
+ "step": 160
294
+ },
295
+ {
296
+ "epoch": 0.43912175648702595,
297
+ "grad_norm": 8486.995137743008,
298
+ "learning_rate": 1.3772940794893916e-05,
299
+ "loss": 28.5985,
300
+ "mean_token_accuracy": 0.28858516551554203,
301
+ "step": 165
302
+ },
303
+ {
304
+ "epoch": 0.4524284763805722,
305
+ "grad_norm": 4760.402708498517,
306
+ "learning_rate": 1.3337336288600297e-05,
307
+ "loss": 24.7618,
308
+ "mean_token_accuracy": 0.3370695985853672,
309
+ "step": 170
310
+ },
311
+ {
312
+ "epoch": 0.4657351962741184,
313
+ "grad_norm": 1237.143262994326,
314
+ "learning_rate": 1.2894482395173695e-05,
315
+ "loss": 17.015,
316
+ "mean_token_accuracy": 0.4780235022306442,
317
+ "step": 175
318
+ },
319
+ {
320
+ "epoch": 0.47904191616766467,
321
+ "grad_norm": 408.8332305447992,
322
+ "learning_rate": 1.24453410851916e-05,
323
+ "loss": 15.019,
324
+ "mean_token_accuracy": 0.516390411555767,
325
+ "step": 180
326
+ },
327
+ {
328
+ "epoch": 0.49234863606121093,
329
+ "grad_norm": 311.65890181237427,
330
+ "learning_rate": 1.1990887986805295e-05,
331
+ "loss": 13.0538,
332
+ "mean_token_accuracy": 0.5649969473481178,
333
+ "step": 185
334
+ },
335
+ {
336
+ "epoch": 0.5056553559547572,
337
+ "grad_norm": 218.78464454962847,
338
+ "learning_rate": 1.1532110266473026e-05,
339
+ "loss": 11.4017,
340
+ "mean_token_accuracy": 0.6076564386487007,
341
+ "step": 190
342
+ },
343
+ {
344
+ "epoch": 0.5189620758483033,
345
+ "grad_norm": 248.3901525774538,
346
+ "learning_rate": 1.1070004484629543e-05,
347
+ "loss": 10.3675,
348
+ "mean_token_accuracy": 0.6390743300318718,
349
+ "step": 195
350
+ },
351
+ {
352
+ "epoch": 0.5322687957418496,
353
+ "grad_norm": 68.79121302231147,
354
+ "learning_rate": 1.0605574430949983e-05,
355
+ "loss": 9.2733,
356
+ "mean_token_accuracy": 0.6695673123002053,
357
+ "step": 200
358
+ },
359
+ {
360
+ "epoch": 0.5322687957418496,
361
+ "eval_loss": 1.2745658159255981,
362
+ "eval_mean_token_accuracy": 0.6473477118545108,
363
+ "eval_runtime": 42.3799,
364
+ "eval_samples_per_second": 3.398,
365
+ "eval_steps_per_second": 0.425,
366
+ "step": 200
367
+ },
368
+ {
369
+ "epoch": 0.5455755156353959,
370
+ "grad_norm": 61.494613995295225,
371
+ "learning_rate": 1.0139828943910358e-05,
372
+ "loss": 8.6282,
373
+ "mean_token_accuracy": 0.6900610521435737,
374
+ "step": 205
375
+ },
376
+ {
377
+ "epoch": 0.5588822355289421,
378
+ "grad_norm": 223.47361816320114,
379
+ "learning_rate": 9.673779719380967e-06,
380
+ "loss": 8.8734,
381
+ "mean_token_accuracy": 0.6839690148830414,
382
+ "step": 210
383
+ },
384
+ {
385
+ "epoch": 0.5721889554224884,
386
+ "grad_norm": 311.485197048925,
387
+ "learning_rate": 9.208439113012984e-06,
388
+ "loss": 9.6346,
389
+ "mean_token_accuracy": 0.6596978038549424,
390
+ "step": 215
391
+ },
392
+ {
393
+ "epoch": 0.5854956753160346,
394
+ "grad_norm": 208.95277149419533,
395
+ "learning_rate": 8.744817941191862e-06,
396
+ "loss": 9.8742,
397
+ "mean_token_accuracy": 0.6521520212292671,
398
+ "step": 220
399
+ },
400
+ {
401
+ "epoch": 0.5988023952095808,
402
+ "grad_norm": 99.55141840914388,
403
+ "learning_rate": 8.283923285334304e-06,
404
+ "loss": 10.0645,
405
+ "mean_token_accuracy": 0.6457211509346962,
406
+ "step": 225
407
+ },
408
+ {
409
+ "epoch": 0.612109115103127,
410
+ "grad_norm": 113.01850407325877,
411
+ "learning_rate": 7.826756304298428e-06,
412
+ "loss": 9.6991,
413
+ "mean_token_accuracy": 0.6567307710647583,
414
+ "step": 230
415
+ },
416
+ {
417
+ "epoch": 0.6254158349966733,
418
+ "grad_norm": 122.00372512450222,
419
+ "learning_rate": 7.3743100596589e-06,
420
+ "loss": 9.3977,
421
+ "mean_token_accuracy": 0.6660859316587449,
422
+ "step": 235
423
+ },
424
+ {
425
+ "epoch": 0.6387225548902196,
426
+ "grad_norm": 141.28653030104314,
427
+ "learning_rate": 6.92756735857107e-06,
428
+ "loss": 9.916,
429
+ "mean_token_accuracy": 0.6507371798157692,
430
+ "step": 240
431
+ },
432
+ {
433
+ "epoch": 0.6520292747837658,
434
+ "grad_norm": 807.7008160726642,
435
+ "learning_rate": 6.487498618909845e-06,
436
+ "loss": 9.8794,
437
+ "mean_token_accuracy": 0.6521420940756798,
438
+ "step": 245
439
+ },
440
+ {
441
+ "epoch": 0.6653359946773121,
442
+ "grad_norm": 12179.703084325536,
443
+ "learning_rate": 6.0550597613206205e-06,
444
+ "loss": 10.3914,
445
+ "mean_token_accuracy": 0.6389522299170494,
446
+ "step": 250
447
+ },
448
+ {
449
+ "epoch": 0.6653359946773121,
450
+ "eval_loss": 1.7774240970611572,
451
+ "eval_mean_token_accuracy": 0.5449769298235575,
452
+ "eval_runtime": 42.2375,
453
+ "eval_samples_per_second": 3.409,
454
+ "eval_steps_per_second": 0.426,
455
+ "step": 250
456
+ },
457
+ {
458
+ "epoch": 0.6786427145708582,
459
+ "grad_norm": 1054.76927310761,
460
+ "learning_rate": 5.631190132761247e-06,
461
+ "loss": 11.7133,
462
+ "mean_token_accuracy": 0.5997588485479355,
463
+ "step": 255
464
+ },
465
+ {
466
+ "epoch": 0.6919494344644045,
467
+ "grad_norm": 1424.9333125346188,
468
+ "learning_rate": 5.216810466045448e-06,
469
+ "loss": 12.5735,
470
+ "mean_token_accuracy": 0.5747481673955918,
471
+ "step": 260
472
+ },
473
+ {
474
+ "epoch": 0.7052561543579507,
475
+ "grad_norm": 798.7973283165478,
476
+ "learning_rate": 4.812820879820034e-06,
477
+ "loss": 13.4974,
478
+ "mean_token_accuracy": 0.5521100461483002,
479
+ "step": 265
480
+ },
481
+ {
482
+ "epoch": 0.718562874251497,
483
+ "grad_norm": 1790.9039210107235,
484
+ "learning_rate": 4.420098923320378e-06,
485
+ "loss": 14.4898,
486
+ "mean_token_accuracy": 0.5296176724135876,
487
+ "step": 270
488
+ },
489
+ {
490
+ "epoch": 0.7318695941450433,
491
+ "grad_norm": 829.5583452937811,
492
+ "learning_rate": 4.0394976701513235e-06,
493
+ "loss": 14.5873,
494
+ "mean_token_accuracy": 0.5268749997019768,
495
+ "step": 275
496
+ },
497
+ {
498
+ "epoch": 0.7451763140385895,
499
+ "grad_norm": 6150.368162782374,
500
+ "learning_rate": 3.671843865234238e-06,
501
+ "loss": 14.6091,
502
+ "mean_token_accuracy": 0.5281791850924492,
503
+ "step": 280
504
+ },
505
+ {
506
+ "epoch": 0.7584830339321357,
507
+ "grad_norm": 29207.957135047905,
508
+ "learning_rate": 3.3179361289454694e-06,
509
+ "loss": 16.9682,
510
+ "mean_token_accuracy": 0.48243742287158964,
511
+ "step": 285
512
+ },
513
+ {
514
+ "epoch": 0.7717897538256819,
515
+ "grad_norm": 54201.045662727825,
516
+ "learning_rate": 2.978543222347076e-06,
517
+ "loss": 20.7529,
518
+ "mean_token_accuracy": 0.4138728640973568,
519
+ "step": 290
520
+ },
521
+ {
522
+ "epoch": 0.7850964737192282,
523
+ "grad_norm": 23857.53870983074,
524
+ "learning_rate": 2.6544023772782736e-06,
525
+ "loss": 21.3047,
526
+ "mean_token_accuracy": 0.403152472525835,
527
+ "step": 295
528
+ },
529
+ {
530
+ "epoch": 0.7984031936127745,
531
+ "grad_norm": 11423.97832396548,
532
+ "learning_rate": 2.346217694934847e-06,
533
+ "loss": 20.4021,
534
+ "mean_token_accuracy": 0.41206730976700784,
535
+ "step": 300
536
+ },
537
+ {
538
+ "epoch": 0.7984031936127745,
539
+ "eval_loss": 3.0904834270477295,
540
+ "eval_mean_token_accuracy": 0.34269434379206765,
541
+ "eval_runtime": 42.4109,
542
+ "eval_samples_per_second": 3.395,
543
+ "eval_steps_per_second": 0.424,
544
+ "step": 300
545
+ },
546
+ {
547
+ "epoch": 0.8117099135063207,
548
+ "grad_norm": 6555.658931606807,
549
+ "learning_rate": 2.0546586164151827e-06,
550
+ "loss": 19.3343,
551
+ "mean_token_accuracy": 0.42890567928552625,
552
+ "step": 305
553
+ },
554
+ {
555
+ "epoch": 0.825016633399867,
556
+ "grad_norm": 8596.180267406271,
557
+ "learning_rate": 1.7803584685552877e-06,
558
+ "loss": 19.1283,
559
+ "mean_token_accuracy": 0.4296626977622509,
560
+ "step": 310
561
+ },
562
+ {
563
+ "epoch": 0.8383233532934131,
564
+ "grad_norm": 7559.729843914057,
565
+ "learning_rate": 1.523913088211415e-06,
566
+ "loss": 19.9312,
567
+ "mean_token_accuracy": 0.4130441091954708,
568
+ "step": 315
569
+ },
570
+ {
571
+ "epoch": 0.8516300731869594,
572
+ "grad_norm": 6283.0641437588,
573
+ "learning_rate": 1.2858795279787517e-06,
574
+ "loss": 20.0128,
575
+ "mean_token_accuracy": 0.40999465957283976,
576
+ "step": 320
577
+ },
578
+ {
579
+ "epoch": 0.8649367930805056,
580
+ "grad_norm": 4627.199331551124,
581
+ "learning_rate": 1.0667748461575544e-06,
582
+ "loss": 20.2021,
583
+ "mean_token_accuracy": 0.40563797727227213,
584
+ "step": 325
585
+ },
586
+ {
587
+ "epoch": 0.8782435129740519,
588
+ "grad_norm": 3896.885861785845,
589
+ "learning_rate": 8.670749835951964e-07,
590
+ "loss": 20.3633,
591
+ "mean_token_accuracy": 0.40211157202720643,
592
+ "step": 330
593
+ },
594
+ {
595
+ "epoch": 0.8915502328675982,
596
+ "grad_norm": 3649.4556728802945,
597
+ "learning_rate": 6.872137298438653e-07,
598
+ "loss": 20.6856,
599
+ "mean_token_accuracy": 0.39633470848202706,
600
+ "step": 335
601
+ },
602
+ {
603
+ "epoch": 0.9048569527611444,
604
+ "grad_norm": 5894.774814516487,
605
+ "learning_rate": 5.275817808796013e-07,
606
+ "loss": 21.1202,
607
+ "mean_token_accuracy": 0.38578067943453787,
608
+ "step": 340
609
+ },
610
+ {
611
+ "epoch": 0.9181636726546906,
612
+ "grad_norm": 4756.426577271269,
613
+ "learning_rate": 3.885258904295575e-07,
614
+ "loss": 21.8543,
615
+ "mean_token_accuracy": 0.37522283270955087,
616
+ "step": 345
617
+ },
618
+ {
619
+ "epoch": 0.9314703925482368,
620
+ "grad_norm": 11889.44146408838,
621
+ "learning_rate": 2.703481167509281e-07,
622
+ "loss": 22.5845,
623
+ "mean_token_accuracy": 0.36378281489014624,
624
+ "step": 350
625
+ },
626
+ {
627
+ "epoch": 0.9314703925482368,
628
+ "eval_loss": 3.6812663078308105,
629
+ "eval_mean_token_accuracy": 0.2753951284620497,
630
+ "eval_runtime": 42.3376,
631
+ "eval_samples_per_second": 3.401,
632
+ "eval_steps_per_second": 0.425,
633
+ "step": 350
634
+ },
635
+ {
636
+ "epoch": 0.9447771124417831,
637
+ "grad_norm": 6803.573866074052,
638
+ "learning_rate": 1.73305166497707e-07,
639
+ "loss": 22.5139,
640
+ "mean_token_accuracy": 0.36300976797938345,
641
+ "step": 355
642
+ },
643
+ {
644
+ "epoch": 0.9580838323353293,
645
+ "grad_norm": 7457.921947626093,
646
+ "learning_rate": 9.760783710056176e-08,
647
+ "loss": 22.5036,
648
+ "mean_token_accuracy": 0.3649313189089298,
649
+ "step": 360
650
+ },
651
+ {
652
+ "epoch": 0.9713905522288756,
653
+ "grad_norm": 8489.33440106548,
654
+ "learning_rate": 4.3420558871060116e-08,
655
+ "loss": 22.7636,
656
+ "mean_token_accuracy": 0.35811431556940077,
657
+ "step": 365
658
+ },
659
+ {
660
+ "epoch": 0.9846972721224219,
661
+ "grad_norm": 10404.8253118789,
662
+ "learning_rate": 1.0861037824896337e-08,
663
+ "loss": 22.8268,
664
+ "mean_token_accuracy": 0.3570818044245243,
665
+ "step": 370
666
+ },
667
+ {
668
+ "epoch": 0.998003992015968,
669
+ "grad_norm": 7227.984198697766,
670
+ "learning_rate": 0.0,
671
+ "loss": 22.9385,
672
+ "mean_token_accuracy": 0.3560729533433914,
673
+ "step": 375
674
+ },
675
+ {
676
+ "epoch": 0.998003992015968,
677
+ "step": 375,
678
+ "total_flos": 3.616227566899167e+18,
679
+ "train_loss": 16.595592213948567,
680
+ "train_runtime": 29081.677,
681
+ "train_samples_per_second": 0.827,
682
+ "train_steps_per_second": 0.013
683
+ }
684
+ ],
685
+ "logging_steps": 5,
686
+ "max_steps": 375,
687
+ "num_input_tokens_seen": 0,
688
+ "num_train_epochs": 1,
689
+ "save_steps": 500,
690
+ "stateful_callbacks": {
691
+ "TrainerControl": {
692
+ "args": {
693
+ "should_epoch_stop": false,
694
+ "should_evaluate": false,
695
+ "should_log": false,
696
+ "should_save": false,
697
+ "should_training_stop": false
698
+ },
699
+ "attributes": {}
700
+ }
701
+ },
702
+ "total_flos": 3.616227566899167e+18,
703
+ "train_batch_size": 2,
704
+ "trial_name": null,
705
+ "trial_params": null
706
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbe9478d8f56b22912ff9b43a0ccc0520a1d9f8aa69517ceb37f58ab500290ce
3
+ size 7288