masani committed on
Commit
1dbee27
·
verified ·
1 Parent(s): 8931f68

End of training

Browse files
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  base_model: openai-community/gpt2-xl
3
  library_name: transformers
4
- model_name: 'gpt2-xl-gsm8k-epoch4-acc0-1. Always '
5
  tags:
6
  - generated_from_trainer
7
  - trl
@@ -9,7 +9,7 @@ tags:
9
  licence: license
10
  ---
11
 
12
- # Model Card for gpt2-xl-gsm8k-epoch4-acc0-1. Always
13
 
14
  This model is a fine-tuned version of [openai-community/gpt2-xl](https://huggingface.co/openai-community/gpt2-xl).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
 
1
  ---
2
  base_model: openai-community/gpt2-xl
3
  library_name: transformers
4
+ model_name: 'gpt2-xl-gsm8k-epoch5-acc0-1. Always '
5
  tags:
6
  - generated_from_trainer
7
  - trl
 
9
  licence: license
10
  ---
11
 
12
+ # Model Card for gpt2-xl-gsm8k-epoch5-acc0-1. Always
13
 
14
  This model is a fine-tuned version of [openai-community/gpt2-xl](https://huggingface.co/openai-community/gpt2-xl).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
epoch5/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai-community/gpt2-xl",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 1600,
16
+ "n_head": 25,
17
+ "n_inner": null,
18
+ "n_layer": 48,
19
+ "n_positions": 1024,
20
+ "output_past": true,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50
34
+ }
35
+ },
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.49.0",
38
+ "use_cache": true,
39
+ "vocab_size": 50257
40
+ }
epoch5/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.49.0"
6
+ }
epoch5/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
epoch5/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60639cd96a2f1dcff024ce6dd400ad994e36c740ac7a907b91ebc7879bd87e64
3
+ size 4959881464
epoch5/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba63f1f31de477c7bda8328c29edd9f0e7140e7e81adf8f6432e9a6eb8c04ea
3
+ size 1270624096
epoch5/model.safetensors.index.json ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 6230444800
4
+ },
5
+ "weight_map": {
6
+ "transformer.h.0.attn.c_attn.bias": "model-00001-of-00002.safetensors",
7
+ "transformer.h.0.attn.c_attn.weight": "model-00001-of-00002.safetensors",
8
+ "transformer.h.0.attn.c_proj.bias": "model-00001-of-00002.safetensors",
9
+ "transformer.h.0.attn.c_proj.weight": "model-00001-of-00002.safetensors",
10
+ "transformer.h.0.ln_1.bias": "model-00001-of-00002.safetensors",
11
+ "transformer.h.0.ln_1.weight": "model-00001-of-00002.safetensors",
12
+ "transformer.h.0.ln_2.bias": "model-00001-of-00002.safetensors",
13
+ "transformer.h.0.ln_2.weight": "model-00001-of-00002.safetensors",
14
+ "transformer.h.0.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
15
+ "transformer.h.0.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
16
+ "transformer.h.0.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
17
+ "transformer.h.0.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
18
+ "transformer.h.1.attn.c_attn.bias": "model-00001-of-00002.safetensors",
19
+ "transformer.h.1.attn.c_attn.weight": "model-00001-of-00002.safetensors",
20
+ "transformer.h.1.attn.c_proj.bias": "model-00001-of-00002.safetensors",
21
+ "transformer.h.1.attn.c_proj.weight": "model-00001-of-00002.safetensors",
22
+ "transformer.h.1.ln_1.bias": "model-00001-of-00002.safetensors",
23
+ "transformer.h.1.ln_1.weight": "model-00001-of-00002.safetensors",
24
+ "transformer.h.1.ln_2.bias": "model-00001-of-00002.safetensors",
25
+ "transformer.h.1.ln_2.weight": "model-00001-of-00002.safetensors",
26
+ "transformer.h.1.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
27
+ "transformer.h.1.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
28
+ "transformer.h.1.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
29
+ "transformer.h.1.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
30
+ "transformer.h.10.attn.c_attn.bias": "model-00001-of-00002.safetensors",
31
+ "transformer.h.10.attn.c_attn.weight": "model-00001-of-00002.safetensors",
32
+ "transformer.h.10.attn.c_proj.bias": "model-00001-of-00002.safetensors",
33
+ "transformer.h.10.attn.c_proj.weight": "model-00001-of-00002.safetensors",
34
+ "transformer.h.10.ln_1.bias": "model-00001-of-00002.safetensors",
35
+ "transformer.h.10.ln_1.weight": "model-00001-of-00002.safetensors",
36
+ "transformer.h.10.ln_2.bias": "model-00001-of-00002.safetensors",
37
+ "transformer.h.10.ln_2.weight": "model-00001-of-00002.safetensors",
38
+ "transformer.h.10.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
39
+ "transformer.h.10.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
40
+ "transformer.h.10.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
41
+ "transformer.h.10.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
42
+ "transformer.h.11.attn.c_attn.bias": "model-00001-of-00002.safetensors",
43
+ "transformer.h.11.attn.c_attn.weight": "model-00001-of-00002.safetensors",
44
+ "transformer.h.11.attn.c_proj.bias": "model-00001-of-00002.safetensors",
45
+ "transformer.h.11.attn.c_proj.weight": "model-00001-of-00002.safetensors",
46
+ "transformer.h.11.ln_1.bias": "model-00001-of-00002.safetensors",
47
+ "transformer.h.11.ln_1.weight": "model-00001-of-00002.safetensors",
48
+ "transformer.h.11.ln_2.bias": "model-00001-of-00002.safetensors",
49
+ "transformer.h.11.ln_2.weight": "model-00001-of-00002.safetensors",
50
+ "transformer.h.11.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
51
+ "transformer.h.11.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
52
+ "transformer.h.11.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
53
+ "transformer.h.11.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
54
+ "transformer.h.12.attn.c_attn.bias": "model-00001-of-00002.safetensors",
55
+ "transformer.h.12.attn.c_attn.weight": "model-00001-of-00002.safetensors",
56
+ "transformer.h.12.attn.c_proj.bias": "model-00001-of-00002.safetensors",
57
+ "transformer.h.12.attn.c_proj.weight": "model-00001-of-00002.safetensors",
58
+ "transformer.h.12.ln_1.bias": "model-00001-of-00002.safetensors",
59
+ "transformer.h.12.ln_1.weight": "model-00001-of-00002.safetensors",
60
+ "transformer.h.12.ln_2.bias": "model-00001-of-00002.safetensors",
61
+ "transformer.h.12.ln_2.weight": "model-00001-of-00002.safetensors",
62
+ "transformer.h.12.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
63
+ "transformer.h.12.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
64
+ "transformer.h.12.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
65
+ "transformer.h.12.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
66
+ "transformer.h.13.attn.c_attn.bias": "model-00001-of-00002.safetensors",
67
+ "transformer.h.13.attn.c_attn.weight": "model-00001-of-00002.safetensors",
68
+ "transformer.h.13.attn.c_proj.bias": "model-00001-of-00002.safetensors",
69
+ "transformer.h.13.attn.c_proj.weight": "model-00001-of-00002.safetensors",
70
+ "transformer.h.13.ln_1.bias": "model-00001-of-00002.safetensors",
71
+ "transformer.h.13.ln_1.weight": "model-00001-of-00002.safetensors",
72
+ "transformer.h.13.ln_2.bias": "model-00001-of-00002.safetensors",
73
+ "transformer.h.13.ln_2.weight": "model-00001-of-00002.safetensors",
74
+ "transformer.h.13.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
75
+ "transformer.h.13.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
76
+ "transformer.h.13.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
77
+ "transformer.h.13.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
78
+ "transformer.h.14.attn.c_attn.bias": "model-00001-of-00002.safetensors",
79
+ "transformer.h.14.attn.c_attn.weight": "model-00001-of-00002.safetensors",
80
+ "transformer.h.14.attn.c_proj.bias": "model-00001-of-00002.safetensors",
81
+ "transformer.h.14.attn.c_proj.weight": "model-00001-of-00002.safetensors",
82
+ "transformer.h.14.ln_1.bias": "model-00001-of-00002.safetensors",
83
+ "transformer.h.14.ln_1.weight": "model-00001-of-00002.safetensors",
84
+ "transformer.h.14.ln_2.bias": "model-00001-of-00002.safetensors",
85
+ "transformer.h.14.ln_2.weight": "model-00001-of-00002.safetensors",
86
+ "transformer.h.14.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
87
+ "transformer.h.14.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
88
+ "transformer.h.14.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
89
+ "transformer.h.14.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
90
+ "transformer.h.15.attn.c_attn.bias": "model-00001-of-00002.safetensors",
91
+ "transformer.h.15.attn.c_attn.weight": "model-00001-of-00002.safetensors",
92
+ "transformer.h.15.attn.c_proj.bias": "model-00001-of-00002.safetensors",
93
+ "transformer.h.15.attn.c_proj.weight": "model-00001-of-00002.safetensors",
94
+ "transformer.h.15.ln_1.bias": "model-00001-of-00002.safetensors",
95
+ "transformer.h.15.ln_1.weight": "model-00001-of-00002.safetensors",
96
+ "transformer.h.15.ln_2.bias": "model-00001-of-00002.safetensors",
97
+ "transformer.h.15.ln_2.weight": "model-00001-of-00002.safetensors",
98
+ "transformer.h.15.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
99
+ "transformer.h.15.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
100
+ "transformer.h.15.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
101
+ "transformer.h.15.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
102
+ "transformer.h.16.attn.c_attn.bias": "model-00001-of-00002.safetensors",
103
+ "transformer.h.16.attn.c_attn.weight": "model-00001-of-00002.safetensors",
104
+ "transformer.h.16.attn.c_proj.bias": "model-00001-of-00002.safetensors",
105
+ "transformer.h.16.attn.c_proj.weight": "model-00001-of-00002.safetensors",
106
+ "transformer.h.16.ln_1.bias": "model-00001-of-00002.safetensors",
107
+ "transformer.h.16.ln_1.weight": "model-00001-of-00002.safetensors",
108
+ "transformer.h.16.ln_2.bias": "model-00001-of-00002.safetensors",
109
+ "transformer.h.16.ln_2.weight": "model-00001-of-00002.safetensors",
110
+ "transformer.h.16.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
111
+ "transformer.h.16.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
112
+ "transformer.h.16.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
113
+ "transformer.h.16.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
114
+ "transformer.h.17.attn.c_attn.bias": "model-00001-of-00002.safetensors",
115
+ "transformer.h.17.attn.c_attn.weight": "model-00001-of-00002.safetensors",
116
+ "transformer.h.17.attn.c_proj.bias": "model-00001-of-00002.safetensors",
117
+ "transformer.h.17.attn.c_proj.weight": "model-00001-of-00002.safetensors",
118
+ "transformer.h.17.ln_1.bias": "model-00001-of-00002.safetensors",
119
+ "transformer.h.17.ln_1.weight": "model-00001-of-00002.safetensors",
120
+ "transformer.h.17.ln_2.bias": "model-00001-of-00002.safetensors",
121
+ "transformer.h.17.ln_2.weight": "model-00001-of-00002.safetensors",
122
+ "transformer.h.17.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
123
+ "transformer.h.17.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
124
+ "transformer.h.17.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
125
+ "transformer.h.17.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
126
+ "transformer.h.18.attn.c_attn.bias": "model-00001-of-00002.safetensors",
127
+ "transformer.h.18.attn.c_attn.weight": "model-00001-of-00002.safetensors",
128
+ "transformer.h.18.attn.c_proj.bias": "model-00001-of-00002.safetensors",
129
+ "transformer.h.18.attn.c_proj.weight": "model-00001-of-00002.safetensors",
130
+ "transformer.h.18.ln_1.bias": "model-00001-of-00002.safetensors",
131
+ "transformer.h.18.ln_1.weight": "model-00001-of-00002.safetensors",
132
+ "transformer.h.18.ln_2.bias": "model-00001-of-00002.safetensors",
133
+ "transformer.h.18.ln_2.weight": "model-00001-of-00002.safetensors",
134
+ "transformer.h.18.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
135
+ "transformer.h.18.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
136
+ "transformer.h.18.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
137
+ "transformer.h.18.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
138
+ "transformer.h.19.attn.c_attn.bias": "model-00001-of-00002.safetensors",
139
+ "transformer.h.19.attn.c_attn.weight": "model-00001-of-00002.safetensors",
140
+ "transformer.h.19.attn.c_proj.bias": "model-00001-of-00002.safetensors",
141
+ "transformer.h.19.attn.c_proj.weight": "model-00001-of-00002.safetensors",
142
+ "transformer.h.19.ln_1.bias": "model-00001-of-00002.safetensors",
143
+ "transformer.h.19.ln_1.weight": "model-00001-of-00002.safetensors",
144
+ "transformer.h.19.ln_2.bias": "model-00001-of-00002.safetensors",
145
+ "transformer.h.19.ln_2.weight": "model-00001-of-00002.safetensors",
146
+ "transformer.h.19.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
147
+ "transformer.h.19.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
148
+ "transformer.h.19.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
149
+ "transformer.h.19.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
150
+ "transformer.h.2.attn.c_attn.bias": "model-00001-of-00002.safetensors",
151
+ "transformer.h.2.attn.c_attn.weight": "model-00001-of-00002.safetensors",
152
+ "transformer.h.2.attn.c_proj.bias": "model-00001-of-00002.safetensors",
153
+ "transformer.h.2.attn.c_proj.weight": "model-00001-of-00002.safetensors",
154
+ "transformer.h.2.ln_1.bias": "model-00001-of-00002.safetensors",
155
+ "transformer.h.2.ln_1.weight": "model-00001-of-00002.safetensors",
156
+ "transformer.h.2.ln_2.bias": "model-00001-of-00002.safetensors",
157
+ "transformer.h.2.ln_2.weight": "model-00001-of-00002.safetensors",
158
+ "transformer.h.2.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
159
+ "transformer.h.2.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
160
+ "transformer.h.2.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
161
+ "transformer.h.2.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
162
+ "transformer.h.20.attn.c_attn.bias": "model-00001-of-00002.safetensors",
163
+ "transformer.h.20.attn.c_attn.weight": "model-00001-of-00002.safetensors",
164
+ "transformer.h.20.attn.c_proj.bias": "model-00001-of-00002.safetensors",
165
+ "transformer.h.20.attn.c_proj.weight": "model-00001-of-00002.safetensors",
166
+ "transformer.h.20.ln_1.bias": "model-00001-of-00002.safetensors",
167
+ "transformer.h.20.ln_1.weight": "model-00001-of-00002.safetensors",
168
+ "transformer.h.20.ln_2.bias": "model-00001-of-00002.safetensors",
169
+ "transformer.h.20.ln_2.weight": "model-00001-of-00002.safetensors",
170
+ "transformer.h.20.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
171
+ "transformer.h.20.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
172
+ "transformer.h.20.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
173
+ "transformer.h.20.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
174
+ "transformer.h.21.attn.c_attn.bias": "model-00001-of-00002.safetensors",
175
+ "transformer.h.21.attn.c_attn.weight": "model-00001-of-00002.safetensors",
176
+ "transformer.h.21.attn.c_proj.bias": "model-00001-of-00002.safetensors",
177
+ "transformer.h.21.attn.c_proj.weight": "model-00001-of-00002.safetensors",
178
+ "transformer.h.21.ln_1.bias": "model-00001-of-00002.safetensors",
179
+ "transformer.h.21.ln_1.weight": "model-00001-of-00002.safetensors",
180
+ "transformer.h.21.ln_2.bias": "model-00001-of-00002.safetensors",
181
+ "transformer.h.21.ln_2.weight": "model-00001-of-00002.safetensors",
182
+ "transformer.h.21.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
183
+ "transformer.h.21.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
184
+ "transformer.h.21.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
185
+ "transformer.h.21.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
186
+ "transformer.h.22.attn.c_attn.bias": "model-00001-of-00002.safetensors",
187
+ "transformer.h.22.attn.c_attn.weight": "model-00001-of-00002.safetensors",
188
+ "transformer.h.22.attn.c_proj.bias": "model-00001-of-00002.safetensors",
189
+ "transformer.h.22.attn.c_proj.weight": "model-00001-of-00002.safetensors",
190
+ "transformer.h.22.ln_1.bias": "model-00001-of-00002.safetensors",
191
+ "transformer.h.22.ln_1.weight": "model-00001-of-00002.safetensors",
192
+ "transformer.h.22.ln_2.bias": "model-00001-of-00002.safetensors",
193
+ "transformer.h.22.ln_2.weight": "model-00001-of-00002.safetensors",
194
+ "transformer.h.22.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
195
+ "transformer.h.22.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
196
+ "transformer.h.22.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
197
+ "transformer.h.22.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
198
+ "transformer.h.23.attn.c_attn.bias": "model-00001-of-00002.safetensors",
199
+ "transformer.h.23.attn.c_attn.weight": "model-00001-of-00002.safetensors",
200
+ "transformer.h.23.attn.c_proj.bias": "model-00001-of-00002.safetensors",
201
+ "transformer.h.23.attn.c_proj.weight": "model-00001-of-00002.safetensors",
202
+ "transformer.h.23.ln_1.bias": "model-00001-of-00002.safetensors",
203
+ "transformer.h.23.ln_1.weight": "model-00001-of-00002.safetensors",
204
+ "transformer.h.23.ln_2.bias": "model-00001-of-00002.safetensors",
205
+ "transformer.h.23.ln_2.weight": "model-00001-of-00002.safetensors",
206
+ "transformer.h.23.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
207
+ "transformer.h.23.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
208
+ "transformer.h.23.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
209
+ "transformer.h.23.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
210
+ "transformer.h.24.attn.c_attn.bias": "model-00001-of-00002.safetensors",
211
+ "transformer.h.24.attn.c_attn.weight": "model-00001-of-00002.safetensors",
212
+ "transformer.h.24.attn.c_proj.bias": "model-00001-of-00002.safetensors",
213
+ "transformer.h.24.attn.c_proj.weight": "model-00001-of-00002.safetensors",
214
+ "transformer.h.24.ln_1.bias": "model-00001-of-00002.safetensors",
215
+ "transformer.h.24.ln_1.weight": "model-00001-of-00002.safetensors",
216
+ "transformer.h.24.ln_2.bias": "model-00001-of-00002.safetensors",
217
+ "transformer.h.24.ln_2.weight": "model-00001-of-00002.safetensors",
218
+ "transformer.h.24.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
219
+ "transformer.h.24.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
220
+ "transformer.h.24.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
221
+ "transformer.h.24.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
222
+ "transformer.h.25.attn.c_attn.bias": "model-00001-of-00002.safetensors",
223
+ "transformer.h.25.attn.c_attn.weight": "model-00001-of-00002.safetensors",
224
+ "transformer.h.25.attn.c_proj.bias": "model-00001-of-00002.safetensors",
225
+ "transformer.h.25.attn.c_proj.weight": "model-00001-of-00002.safetensors",
226
+ "transformer.h.25.ln_1.bias": "model-00001-of-00002.safetensors",
227
+ "transformer.h.25.ln_1.weight": "model-00001-of-00002.safetensors",
228
+ "transformer.h.25.ln_2.bias": "model-00001-of-00002.safetensors",
229
+ "transformer.h.25.ln_2.weight": "model-00001-of-00002.safetensors",
230
+ "transformer.h.25.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
231
+ "transformer.h.25.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
232
+ "transformer.h.25.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
233
+ "transformer.h.25.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
234
+ "transformer.h.26.attn.c_attn.bias": "model-00001-of-00002.safetensors",
235
+ "transformer.h.26.attn.c_attn.weight": "model-00001-of-00002.safetensors",
236
+ "transformer.h.26.attn.c_proj.bias": "model-00001-of-00002.safetensors",
237
+ "transformer.h.26.attn.c_proj.weight": "model-00001-of-00002.safetensors",
238
+ "transformer.h.26.ln_1.bias": "model-00001-of-00002.safetensors",
239
+ "transformer.h.26.ln_1.weight": "model-00001-of-00002.safetensors",
240
+ "transformer.h.26.ln_2.bias": "model-00001-of-00002.safetensors",
241
+ "transformer.h.26.ln_2.weight": "model-00001-of-00002.safetensors",
242
+ "transformer.h.26.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
243
+ "transformer.h.26.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
244
+ "transformer.h.26.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
245
+ "transformer.h.26.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
246
+ "transformer.h.27.attn.c_attn.bias": "model-00001-of-00002.safetensors",
247
+ "transformer.h.27.attn.c_attn.weight": "model-00001-of-00002.safetensors",
248
+ "transformer.h.27.attn.c_proj.bias": "model-00001-of-00002.safetensors",
249
+ "transformer.h.27.attn.c_proj.weight": "model-00001-of-00002.safetensors",
250
+ "transformer.h.27.ln_1.bias": "model-00001-of-00002.safetensors",
251
+ "transformer.h.27.ln_1.weight": "model-00001-of-00002.safetensors",
252
+ "transformer.h.27.ln_2.bias": "model-00001-of-00002.safetensors",
253
+ "transformer.h.27.ln_2.weight": "model-00001-of-00002.safetensors",
254
+ "transformer.h.27.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
255
+ "transformer.h.27.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
256
+ "transformer.h.27.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
257
+ "transformer.h.27.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
258
+ "transformer.h.28.attn.c_attn.bias": "model-00001-of-00002.safetensors",
259
+ "transformer.h.28.attn.c_attn.weight": "model-00001-of-00002.safetensors",
260
+ "transformer.h.28.attn.c_proj.bias": "model-00001-of-00002.safetensors",
261
+ "transformer.h.28.attn.c_proj.weight": "model-00001-of-00002.safetensors",
262
+ "transformer.h.28.ln_1.bias": "model-00001-of-00002.safetensors",
263
+ "transformer.h.28.ln_1.weight": "model-00001-of-00002.safetensors",
264
+ "transformer.h.28.ln_2.bias": "model-00001-of-00002.safetensors",
265
+ "transformer.h.28.ln_2.weight": "model-00001-of-00002.safetensors",
266
+ "transformer.h.28.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
267
+ "transformer.h.28.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
268
+ "transformer.h.28.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
269
+ "transformer.h.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
270
+ "transformer.h.29.attn.c_attn.bias": "model-00001-of-00002.safetensors",
271
+ "transformer.h.29.attn.c_attn.weight": "model-00001-of-00002.safetensors",
272
+ "transformer.h.29.attn.c_proj.bias": "model-00001-of-00002.safetensors",
273
+ "transformer.h.29.attn.c_proj.weight": "model-00001-of-00002.safetensors",
274
+ "transformer.h.29.ln_1.bias": "model-00001-of-00002.safetensors",
275
+ "transformer.h.29.ln_1.weight": "model-00001-of-00002.safetensors",
276
+ "transformer.h.29.ln_2.bias": "model-00001-of-00002.safetensors",
277
+ "transformer.h.29.ln_2.weight": "model-00001-of-00002.safetensors",
278
+ "transformer.h.29.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
279
+ "transformer.h.29.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
280
+ "transformer.h.29.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
281
+ "transformer.h.29.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
282
+ "transformer.h.3.attn.c_attn.bias": "model-00001-of-00002.safetensors",
283
+ "transformer.h.3.attn.c_attn.weight": "model-00001-of-00002.safetensors",
284
+ "transformer.h.3.attn.c_proj.bias": "model-00001-of-00002.safetensors",
285
+ "transformer.h.3.attn.c_proj.weight": "model-00001-of-00002.safetensors",
286
+ "transformer.h.3.ln_1.bias": "model-00001-of-00002.safetensors",
287
+ "transformer.h.3.ln_1.weight": "model-00001-of-00002.safetensors",
288
+ "transformer.h.3.ln_2.bias": "model-00001-of-00002.safetensors",
289
+ "transformer.h.3.ln_2.weight": "model-00001-of-00002.safetensors",
290
+ "transformer.h.3.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
291
+ "transformer.h.3.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
292
+ "transformer.h.3.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
293
+ "transformer.h.3.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
294
+ "transformer.h.30.attn.c_attn.bias": "model-00001-of-00002.safetensors",
295
+ "transformer.h.30.attn.c_attn.weight": "model-00001-of-00002.safetensors",
296
+ "transformer.h.30.attn.c_proj.bias": "model-00001-of-00002.safetensors",
297
+ "transformer.h.30.attn.c_proj.weight": "model-00001-of-00002.safetensors",
298
+ "transformer.h.30.ln_1.bias": "model-00001-of-00002.safetensors",
299
+ "transformer.h.30.ln_1.weight": "model-00001-of-00002.safetensors",
300
+ "transformer.h.30.ln_2.bias": "model-00001-of-00002.safetensors",
301
+ "transformer.h.30.ln_2.weight": "model-00001-of-00002.safetensors",
302
+ "transformer.h.30.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
303
+ "transformer.h.30.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
304
+ "transformer.h.30.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
305
+ "transformer.h.30.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
306
+ "transformer.h.31.attn.c_attn.bias": "model-00001-of-00002.safetensors",
307
+ "transformer.h.31.attn.c_attn.weight": "model-00001-of-00002.safetensors",
308
+ "transformer.h.31.attn.c_proj.bias": "model-00001-of-00002.safetensors",
309
+ "transformer.h.31.attn.c_proj.weight": "model-00001-of-00002.safetensors",
310
+ "transformer.h.31.ln_1.bias": "model-00001-of-00002.safetensors",
311
+ "transformer.h.31.ln_1.weight": "model-00001-of-00002.safetensors",
312
+ "transformer.h.31.ln_2.bias": "model-00001-of-00002.safetensors",
313
+ "transformer.h.31.ln_2.weight": "model-00001-of-00002.safetensors",
314
+ "transformer.h.31.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
315
+ "transformer.h.31.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
316
+ "transformer.h.31.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
317
+ "transformer.h.31.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
318
+ "transformer.h.32.attn.c_attn.bias": "model-00001-of-00002.safetensors",
319
+ "transformer.h.32.attn.c_attn.weight": "model-00001-of-00002.safetensors",
320
+ "transformer.h.32.attn.c_proj.bias": "model-00001-of-00002.safetensors",
321
+ "transformer.h.32.attn.c_proj.weight": "model-00001-of-00002.safetensors",
322
+ "transformer.h.32.ln_1.bias": "model-00001-of-00002.safetensors",
323
+ "transformer.h.32.ln_1.weight": "model-00001-of-00002.safetensors",
324
+ "transformer.h.32.ln_2.bias": "model-00001-of-00002.safetensors",
325
+ "transformer.h.32.ln_2.weight": "model-00001-of-00002.safetensors",
326
+ "transformer.h.32.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
327
+ "transformer.h.32.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
328
+ "transformer.h.32.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
329
+ "transformer.h.32.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
330
+ "transformer.h.33.attn.c_attn.bias": "model-00001-of-00002.safetensors",
331
+ "transformer.h.33.attn.c_attn.weight": "model-00001-of-00002.safetensors",
332
+ "transformer.h.33.attn.c_proj.bias": "model-00001-of-00002.safetensors",
333
+ "transformer.h.33.attn.c_proj.weight": "model-00001-of-00002.safetensors",
334
+ "transformer.h.33.ln_1.bias": "model-00001-of-00002.safetensors",
335
+ "transformer.h.33.ln_1.weight": "model-00001-of-00002.safetensors",
336
+ "transformer.h.33.ln_2.bias": "model-00001-of-00002.safetensors",
337
+ "transformer.h.33.ln_2.weight": "model-00001-of-00002.safetensors",
338
+ "transformer.h.33.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
339
+ "transformer.h.33.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
340
+ "transformer.h.33.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
341
+ "transformer.h.33.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
342
+ "transformer.h.34.attn.c_attn.bias": "model-00001-of-00002.safetensors",
343
+ "transformer.h.34.attn.c_attn.weight": "model-00001-of-00002.safetensors",
344
+ "transformer.h.34.attn.c_proj.bias": "model-00001-of-00002.safetensors",
345
+ "transformer.h.34.attn.c_proj.weight": "model-00001-of-00002.safetensors",
346
+ "transformer.h.34.ln_1.bias": "model-00001-of-00002.safetensors",
347
+ "transformer.h.34.ln_1.weight": "model-00001-of-00002.safetensors",
348
+ "transformer.h.34.ln_2.bias": "model-00001-of-00002.safetensors",
349
+ "transformer.h.34.ln_2.weight": "model-00001-of-00002.safetensors",
350
+ "transformer.h.34.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
351
+ "transformer.h.34.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
352
+ "transformer.h.34.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
353
+ "transformer.h.34.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
354
+ "transformer.h.35.attn.c_attn.bias": "model-00001-of-00002.safetensors",
355
+ "transformer.h.35.attn.c_attn.weight": "model-00001-of-00002.safetensors",
356
+ "transformer.h.35.attn.c_proj.bias": "model-00001-of-00002.safetensors",
357
+ "transformer.h.35.attn.c_proj.weight": "model-00001-of-00002.safetensors",
358
+ "transformer.h.35.ln_1.bias": "model-00001-of-00002.safetensors",
359
+ "transformer.h.35.ln_1.weight": "model-00001-of-00002.safetensors",
360
+ "transformer.h.35.ln_2.bias": "model-00001-of-00002.safetensors",
361
+ "transformer.h.35.ln_2.weight": "model-00001-of-00002.safetensors",
362
+ "transformer.h.35.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
363
+ "transformer.h.35.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
364
+ "transformer.h.35.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
365
+ "transformer.h.35.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
366
+ "transformer.h.36.attn.c_attn.bias": "model-00001-of-00002.safetensors",
367
+ "transformer.h.36.attn.c_attn.weight": "model-00001-of-00002.safetensors",
368
+ "transformer.h.36.attn.c_proj.bias": "model-00001-of-00002.safetensors",
369
+ "transformer.h.36.attn.c_proj.weight": "model-00001-of-00002.safetensors",
370
+ "transformer.h.36.ln_1.bias": "model-00001-of-00002.safetensors",
371
+ "transformer.h.36.ln_1.weight": "model-00001-of-00002.safetensors",
372
+ "transformer.h.36.ln_2.bias": "model-00001-of-00002.safetensors",
373
+ "transformer.h.36.ln_2.weight": "model-00001-of-00002.safetensors",
374
+ "transformer.h.36.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
375
+ "transformer.h.36.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
376
+ "transformer.h.36.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
377
+ "transformer.h.36.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
378
+ "transformer.h.37.attn.c_attn.bias": "model-00001-of-00002.safetensors",
379
+ "transformer.h.37.attn.c_attn.weight": "model-00001-of-00002.safetensors",
380
+ "transformer.h.37.attn.c_proj.bias": "model-00001-of-00002.safetensors",
381
+ "transformer.h.37.attn.c_proj.weight": "model-00001-of-00002.safetensors",
382
+ "transformer.h.37.ln_1.bias": "model-00001-of-00002.safetensors",
383
+ "transformer.h.37.ln_1.weight": "model-00001-of-00002.safetensors",
384
+ "transformer.h.37.ln_2.bias": "model-00001-of-00002.safetensors",
385
+ "transformer.h.37.ln_2.weight": "model-00001-of-00002.safetensors",
386
+ "transformer.h.37.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
387
+ "transformer.h.37.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
388
+ "transformer.h.37.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
389
+ "transformer.h.37.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
390
+ "transformer.h.38.attn.c_attn.bias": "model-00002-of-00002.safetensors",
391
+ "transformer.h.38.attn.c_attn.weight": "model-00002-of-00002.safetensors",
392
+ "transformer.h.38.attn.c_proj.bias": "model-00002-of-00002.safetensors",
393
+ "transformer.h.38.attn.c_proj.weight": "model-00002-of-00002.safetensors",
394
+ "transformer.h.38.ln_1.bias": "model-00002-of-00002.safetensors",
395
+ "transformer.h.38.ln_1.weight": "model-00002-of-00002.safetensors",
396
+ "transformer.h.38.ln_2.bias": "model-00002-of-00002.safetensors",
397
+ "transformer.h.38.ln_2.weight": "model-00002-of-00002.safetensors",
398
+ "transformer.h.38.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
399
+ "transformer.h.38.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
400
+ "transformer.h.38.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
401
+ "transformer.h.38.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
402
+ "transformer.h.39.attn.c_attn.bias": "model-00002-of-00002.safetensors",
403
+ "transformer.h.39.attn.c_attn.weight": "model-00002-of-00002.safetensors",
404
+ "transformer.h.39.attn.c_proj.bias": "model-00002-of-00002.safetensors",
405
+ "transformer.h.39.attn.c_proj.weight": "model-00002-of-00002.safetensors",
406
+ "transformer.h.39.ln_1.bias": "model-00002-of-00002.safetensors",
407
+ "transformer.h.39.ln_1.weight": "model-00002-of-00002.safetensors",
408
+ "transformer.h.39.ln_2.bias": "model-00002-of-00002.safetensors",
409
+ "transformer.h.39.ln_2.weight": "model-00002-of-00002.safetensors",
410
+ "transformer.h.39.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
411
+ "transformer.h.39.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
412
+ "transformer.h.39.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
413
+ "transformer.h.39.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
414
+ "transformer.h.4.attn.c_attn.bias": "model-00001-of-00002.safetensors",
415
+ "transformer.h.4.attn.c_attn.weight": "model-00001-of-00002.safetensors",
416
+ "transformer.h.4.attn.c_proj.bias": "model-00001-of-00002.safetensors",
417
+ "transformer.h.4.attn.c_proj.weight": "model-00001-of-00002.safetensors",
418
+ "transformer.h.4.ln_1.bias": "model-00001-of-00002.safetensors",
419
+ "transformer.h.4.ln_1.weight": "model-00001-of-00002.safetensors",
420
+ "transformer.h.4.ln_2.bias": "model-00001-of-00002.safetensors",
421
+ "transformer.h.4.ln_2.weight": "model-00001-of-00002.safetensors",
422
+ "transformer.h.4.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
423
+ "transformer.h.4.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
424
+ "transformer.h.4.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
425
+ "transformer.h.4.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
426
+ "transformer.h.40.attn.c_attn.bias": "model-00002-of-00002.safetensors",
427
+ "transformer.h.40.attn.c_attn.weight": "model-00002-of-00002.safetensors",
428
+ "transformer.h.40.attn.c_proj.bias": "model-00002-of-00002.safetensors",
429
+ "transformer.h.40.attn.c_proj.weight": "model-00002-of-00002.safetensors",
430
+ "transformer.h.40.ln_1.bias": "model-00002-of-00002.safetensors",
431
+ "transformer.h.40.ln_1.weight": "model-00002-of-00002.safetensors",
432
+ "transformer.h.40.ln_2.bias": "model-00002-of-00002.safetensors",
433
+ "transformer.h.40.ln_2.weight": "model-00002-of-00002.safetensors",
434
+ "transformer.h.40.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
435
+ "transformer.h.40.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
436
+ "transformer.h.40.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
437
+ "transformer.h.40.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
438
+ "transformer.h.41.attn.c_attn.bias": "model-00002-of-00002.safetensors",
439
+ "transformer.h.41.attn.c_attn.weight": "model-00002-of-00002.safetensors",
440
+ "transformer.h.41.attn.c_proj.bias": "model-00002-of-00002.safetensors",
441
+ "transformer.h.41.attn.c_proj.weight": "model-00002-of-00002.safetensors",
442
+ "transformer.h.41.ln_1.bias": "model-00002-of-00002.safetensors",
443
+ "transformer.h.41.ln_1.weight": "model-00002-of-00002.safetensors",
444
+ "transformer.h.41.ln_2.bias": "model-00002-of-00002.safetensors",
445
+ "transformer.h.41.ln_2.weight": "model-00002-of-00002.safetensors",
446
+ "transformer.h.41.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
447
+ "transformer.h.41.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
448
+ "transformer.h.41.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
449
+ "transformer.h.41.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
450
+ "transformer.h.42.attn.c_attn.bias": "model-00002-of-00002.safetensors",
451
+ "transformer.h.42.attn.c_attn.weight": "model-00002-of-00002.safetensors",
452
+ "transformer.h.42.attn.c_proj.bias": "model-00002-of-00002.safetensors",
453
+ "transformer.h.42.attn.c_proj.weight": "model-00002-of-00002.safetensors",
454
+ "transformer.h.42.ln_1.bias": "model-00002-of-00002.safetensors",
455
+ "transformer.h.42.ln_1.weight": "model-00002-of-00002.safetensors",
456
+ "transformer.h.42.ln_2.bias": "model-00002-of-00002.safetensors",
457
+ "transformer.h.42.ln_2.weight": "model-00002-of-00002.safetensors",
458
+ "transformer.h.42.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
459
+ "transformer.h.42.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
460
+ "transformer.h.42.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
461
+ "transformer.h.42.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
462
+ "transformer.h.43.attn.c_attn.bias": "model-00002-of-00002.safetensors",
463
+ "transformer.h.43.attn.c_attn.weight": "model-00002-of-00002.safetensors",
464
+ "transformer.h.43.attn.c_proj.bias": "model-00002-of-00002.safetensors",
465
+ "transformer.h.43.attn.c_proj.weight": "model-00002-of-00002.safetensors",
466
+ "transformer.h.43.ln_1.bias": "model-00002-of-00002.safetensors",
467
+ "transformer.h.43.ln_1.weight": "model-00002-of-00002.safetensors",
468
+ "transformer.h.43.ln_2.bias": "model-00002-of-00002.safetensors",
469
+ "transformer.h.43.ln_2.weight": "model-00002-of-00002.safetensors",
470
+ "transformer.h.43.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
471
+ "transformer.h.43.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
472
+ "transformer.h.43.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
473
+ "transformer.h.43.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
474
+ "transformer.h.44.attn.c_attn.bias": "model-00002-of-00002.safetensors",
475
+ "transformer.h.44.attn.c_attn.weight": "model-00002-of-00002.safetensors",
476
+ "transformer.h.44.attn.c_proj.bias": "model-00002-of-00002.safetensors",
477
+ "transformer.h.44.attn.c_proj.weight": "model-00002-of-00002.safetensors",
478
+ "transformer.h.44.ln_1.bias": "model-00002-of-00002.safetensors",
479
+ "transformer.h.44.ln_1.weight": "model-00002-of-00002.safetensors",
480
+ "transformer.h.44.ln_2.bias": "model-00002-of-00002.safetensors",
481
+ "transformer.h.44.ln_2.weight": "model-00002-of-00002.safetensors",
482
+ "transformer.h.44.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
483
+ "transformer.h.44.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
484
+ "transformer.h.44.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
485
+ "transformer.h.44.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
486
+ "transformer.h.45.attn.c_attn.bias": "model-00002-of-00002.safetensors",
487
+ "transformer.h.45.attn.c_attn.weight": "model-00002-of-00002.safetensors",
488
+ "transformer.h.45.attn.c_proj.bias": "model-00002-of-00002.safetensors",
489
+ "transformer.h.45.attn.c_proj.weight": "model-00002-of-00002.safetensors",
490
+ "transformer.h.45.ln_1.bias": "model-00002-of-00002.safetensors",
491
+ "transformer.h.45.ln_1.weight": "model-00002-of-00002.safetensors",
492
+ "transformer.h.45.ln_2.bias": "model-00002-of-00002.safetensors",
493
+ "transformer.h.45.ln_2.weight": "model-00002-of-00002.safetensors",
494
+ "transformer.h.45.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
495
+ "transformer.h.45.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
496
+ "transformer.h.45.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
497
+ "transformer.h.45.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
498
+ "transformer.h.46.attn.c_attn.bias": "model-00002-of-00002.safetensors",
499
+ "transformer.h.46.attn.c_attn.weight": "model-00002-of-00002.safetensors",
500
+ "transformer.h.46.attn.c_proj.bias": "model-00002-of-00002.safetensors",
501
+ "transformer.h.46.attn.c_proj.weight": "model-00002-of-00002.safetensors",
502
+ "transformer.h.46.ln_1.bias": "model-00002-of-00002.safetensors",
503
+ "transformer.h.46.ln_1.weight": "model-00002-of-00002.safetensors",
504
+ "transformer.h.46.ln_2.bias": "model-00002-of-00002.safetensors",
505
+ "transformer.h.46.ln_2.weight": "model-00002-of-00002.safetensors",
506
+ "transformer.h.46.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
507
+ "transformer.h.46.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
508
+ "transformer.h.46.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
509
+ "transformer.h.46.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
510
+ "transformer.h.47.attn.c_attn.bias": "model-00002-of-00002.safetensors",
511
+ "transformer.h.47.attn.c_attn.weight": "model-00002-of-00002.safetensors",
512
+ "transformer.h.47.attn.c_proj.bias": "model-00002-of-00002.safetensors",
513
+ "transformer.h.47.attn.c_proj.weight": "model-00002-of-00002.safetensors",
514
+ "transformer.h.47.ln_1.bias": "model-00002-of-00002.safetensors",
515
+ "transformer.h.47.ln_1.weight": "model-00002-of-00002.safetensors",
516
+ "transformer.h.47.ln_2.bias": "model-00002-of-00002.safetensors",
517
+ "transformer.h.47.ln_2.weight": "model-00002-of-00002.safetensors",
518
+ "transformer.h.47.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
519
+ "transformer.h.47.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
520
+ "transformer.h.47.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
521
+ "transformer.h.47.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
522
+ "transformer.h.5.attn.c_attn.bias": "model-00001-of-00002.safetensors",
523
+ "transformer.h.5.attn.c_attn.weight": "model-00001-of-00002.safetensors",
524
+ "transformer.h.5.attn.c_proj.bias": "model-00001-of-00002.safetensors",
525
+ "transformer.h.5.attn.c_proj.weight": "model-00001-of-00002.safetensors",
526
+ "transformer.h.5.ln_1.bias": "model-00001-of-00002.safetensors",
527
+ "transformer.h.5.ln_1.weight": "model-00001-of-00002.safetensors",
528
+ "transformer.h.5.ln_2.bias": "model-00001-of-00002.safetensors",
529
+ "transformer.h.5.ln_2.weight": "model-00001-of-00002.safetensors",
530
+ "transformer.h.5.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
531
+ "transformer.h.5.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
532
+ "transformer.h.5.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
533
+ "transformer.h.5.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
534
+ "transformer.h.6.attn.c_attn.bias": "model-00001-of-00002.safetensors",
535
+ "transformer.h.6.attn.c_attn.weight": "model-00001-of-00002.safetensors",
536
+ "transformer.h.6.attn.c_proj.bias": "model-00001-of-00002.safetensors",
537
+ "transformer.h.6.attn.c_proj.weight": "model-00001-of-00002.safetensors",
538
+ "transformer.h.6.ln_1.bias": "model-00001-of-00002.safetensors",
539
+ "transformer.h.6.ln_1.weight": "model-00001-of-00002.safetensors",
540
+ "transformer.h.6.ln_2.bias": "model-00001-of-00002.safetensors",
541
+ "transformer.h.6.ln_2.weight": "model-00001-of-00002.safetensors",
542
+ "transformer.h.6.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
543
+ "transformer.h.6.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
544
+ "transformer.h.6.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
545
+ "transformer.h.6.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
546
+ "transformer.h.7.attn.c_attn.bias": "model-00001-of-00002.safetensors",
547
+ "transformer.h.7.attn.c_attn.weight": "model-00001-of-00002.safetensors",
548
+ "transformer.h.7.attn.c_proj.bias": "model-00001-of-00002.safetensors",
549
+ "transformer.h.7.attn.c_proj.weight": "model-00001-of-00002.safetensors",
550
+ "transformer.h.7.ln_1.bias": "model-00001-of-00002.safetensors",
551
+ "transformer.h.7.ln_1.weight": "model-00001-of-00002.safetensors",
552
+ "transformer.h.7.ln_2.bias": "model-00001-of-00002.safetensors",
553
+ "transformer.h.7.ln_2.weight": "model-00001-of-00002.safetensors",
554
+ "transformer.h.7.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
555
+ "transformer.h.7.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
556
+ "transformer.h.7.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
557
+ "transformer.h.7.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
558
+ "transformer.h.8.attn.c_attn.bias": "model-00001-of-00002.safetensors",
559
+ "transformer.h.8.attn.c_attn.weight": "model-00001-of-00002.safetensors",
560
+ "transformer.h.8.attn.c_proj.bias": "model-00001-of-00002.safetensors",
561
+ "transformer.h.8.attn.c_proj.weight": "model-00001-of-00002.safetensors",
562
+ "transformer.h.8.ln_1.bias": "model-00001-of-00002.safetensors",
563
+ "transformer.h.8.ln_1.weight": "model-00001-of-00002.safetensors",
564
+ "transformer.h.8.ln_2.bias": "model-00001-of-00002.safetensors",
565
+ "transformer.h.8.ln_2.weight": "model-00001-of-00002.safetensors",
566
+ "transformer.h.8.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
567
+ "transformer.h.8.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
568
+ "transformer.h.8.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
569
+ "transformer.h.8.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
570
+ "transformer.h.9.attn.c_attn.bias": "model-00001-of-00002.safetensors",
571
+ "transformer.h.9.attn.c_attn.weight": "model-00001-of-00002.safetensors",
572
+ "transformer.h.9.attn.c_proj.bias": "model-00001-of-00002.safetensors",
573
+ "transformer.h.9.attn.c_proj.weight": "model-00001-of-00002.safetensors",
574
+ "transformer.h.9.ln_1.bias": "model-00001-of-00002.safetensors",
575
+ "transformer.h.9.ln_1.weight": "model-00001-of-00002.safetensors",
576
+ "transformer.h.9.ln_2.bias": "model-00001-of-00002.safetensors",
577
+ "transformer.h.9.ln_2.weight": "model-00001-of-00002.safetensors",
578
+ "transformer.h.9.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
579
+ "transformer.h.9.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
580
+ "transformer.h.9.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
581
+ "transformer.h.9.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
582
+ "transformer.ln_f.bias": "model-00002-of-00002.safetensors",
583
+ "transformer.ln_f.weight": "model-00002-of-00002.safetensors",
584
+ "transformer.wpe.weight": "model-00001-of-00002.safetensors",
585
+ "transformer.wte.weight": "model-00001-of-00002.safetensors"
586
+ }
587
+ }
epoch5/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
epoch5/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
epoch5/tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "padding_side": "left",
20
+ "tokenizer_class": "GPT2Tokenizer",
21
+ "unk_token": "<|endoftext|>"
22
+ }
epoch5/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f14f3c6d299bfb369a2106aab54c59a032e03c366e1d1fecdcf02f954b66a25b
3
+ size 5624
epoch5/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8eb2dee99b44656f783f5efbf4a3bbc9dcd0dab175ed7174537bf6f3ce5196f5
3
  size 4959881464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60639cd96a2f1dcff024ce6dd400ad994e36c740ac7a907b91ebc7879bd87e64
3
  size 4959881464
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cca01b2571e41c5fadb5e16768acf50f5ddde882e3a06a8b1e8c8dcea41040c6
3
  size 1270624096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba63f1f31de477c7bda8328c29edd9f0e7140e7e81adf8f6432e9a6eb8c04ea
3
  size 1270624096
wandb/run-20250402_145246-e1n3xkh6/files/output.log CHANGED
@@ -418,7 +418,108 @@ model-00001-of-00002.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 4.95G/4.9
418
  {'eval_loss': 0.8552775382995605, 'eval_runtime': 97.2843, 'eval_samples_per_second': 13.558, 'eval_steps_per_second': 0.853, 'eval_mean_token_accuracy': 0.8052615293537277, 'epoch': 4.0}
419
  model-00001-of-00002.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4.96G/4.96G [03:02<00:00, 27.2MB/s]
420
  Upload 5 LFS files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [03:02<00:00, 36.57s/it] :02<00:00, 37.4MB/s]
421
- model-00001-of-00002.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 4.94G/4.96G [02:08<00:00, 42.7MB/s]
422
  model-00002-of-00002.safetensors: 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1.26G/1.27G [00:44<00:00, 34.1MB/s]
423
 
424
  Upload 5 LFS files: 20%|β–ˆβ–ˆ | 1/5 [03:02<12:11, 182.85s/it]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  {'eval_loss': 0.8552775382995605, 'eval_runtime': 97.2843, 'eval_samples_per_second': 13.558, 'eval_steps_per_second': 0.853, 'eval_mean_token_accuracy': 0.8052615293537277, 'epoch': 4.0}
419
  model-00001-of-00002.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4.96G/4.96G [03:02<00:00, 27.2MB/s]
420
  Upload 5 LFS files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [03:02<00:00, 36.57s/it] :02<00:00, 37.4MB/s]
421
+ .96G [02:08<00:00, 42.7MB/s]
422
  model-00002-of-00002.safetensors: 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 1.26G/1.27G [00:44<00:00, 34.1MB/s]
423
 
424
  Upload 5 LFS files: 20%|β–ˆβ–ˆ | 1/5 [03:02<12:11, 182.85s/it]
425
+ {'loss': 0.153, 'grad_norm': 1.3796011209487915, 'learning_rate': 1.197860962566845e-05, 'mean_token_accuracy': 0.9557063996791839, 'epoch': 4.01}
426
+ {'loss': 0.1861, 'grad_norm': 2.260554075241089, 'learning_rate': 1.19572192513369e-05, 'mean_token_accuracy': 0.9432671725749969, 'epoch': 4.02}
427
+ {'loss': 0.175, 'grad_norm': 1.8162566423416138, 'learning_rate': 1.1935828877005349e-05, 'mean_token_accuracy': 0.9487885594367981, 'epoch': 4.03}
428
+ {'loss': 0.1647, 'grad_norm': 1.6924371719360352, 'learning_rate': 1.1914438502673798e-05, 'mean_token_accuracy': 0.9523689925670624, 'epoch': 4.04}
429
+ {'loss': 0.1692, 'grad_norm': 1.7612805366516113, 'learning_rate': 1.1893048128342247e-05, 'mean_token_accuracy': 0.9488636136054993, 'epoch': 4.05}
430
+ {'loss': 0.1805, 'grad_norm': 1.743569016456604, 'learning_rate': 1.1871657754010697e-05, 'mean_token_accuracy': 0.9443089008331299, 'epoch': 4.06}
431
+ {'loss': 0.1631, 'grad_norm': 1.5030018091201782, 'learning_rate': 1.1850267379679146e-05, 'mean_token_accuracy': 0.949979031085968, 'epoch': 4.07}
432
+ {'loss': 0.1564, 'grad_norm': 1.648834228515625, 'learning_rate': 1.1828877005347595e-05, 'mean_token_accuracy': 0.9512568712234497, 'epoch': 4.09}
433
+ {'loss': 0.1786, 'grad_norm': 1.85893714427948, 'learning_rate': 1.1807486631016042e-05, 'mean_token_accuracy': 0.9475835382938385, 'epoch': 4.1}
434
+ {'loss': 0.1775, 'grad_norm': 1.4474387168884277, 'learning_rate': 1.1786096256684495e-05, 'mean_token_accuracy': 0.946813315153122, 'epoch': 4.11}
435
+ {'loss': 0.1832, 'grad_norm': 2.0652883052825928, 'learning_rate': 1.1764705882352942e-05, 'mean_token_accuracy': 0.9461455881595612, 'epoch': 4.12}
436
+ {'loss': 0.1696, 'grad_norm': 2.3539726734161377, 'learning_rate': 1.174331550802139e-05, 'mean_token_accuracy': 0.9492765009403229, 'epoch': 4.13}
437
+ {'loss': 0.173, 'grad_norm': 1.7823604345321655, 'learning_rate': 1.172192513368984e-05, 'mean_token_accuracy': 0.9457501530647278, 'epoch': 4.14}
438
+ {'loss': 0.1907, 'grad_norm': 1.9060957431793213, 'learning_rate': 1.170053475935829e-05, 'mean_token_accuracy': 0.9392943561077118, 'epoch': 4.15}
439
+ {'loss': 0.1742, 'grad_norm': 1.4347060918807983, 'learning_rate': 1.167914438502674e-05, 'mean_token_accuracy': 0.9455202996730805, 'epoch': 4.16}
440
+ {'loss': 0.1791, 'grad_norm': 2.15783953666687, 'learning_rate': 1.1657754010695188e-05, 'mean_token_accuracy': 0.9481375277042389, 'epoch': 4.17}
441
+ {'loss': 0.1684, 'grad_norm': 1.7089780569076538, 'learning_rate': 1.1636363636363637e-05, 'mean_token_accuracy': 0.9490509748458862, 'epoch': 4.18}
442
+ {'loss': 0.1742, 'grad_norm': 1.9038798809051514, 'learning_rate': 1.1614973262032086e-05, 'mean_token_accuracy': 0.9479456424713135, 'epoch': 4.19}
443
+ {'loss': 0.1861, 'grad_norm': 1.9221370220184326, 'learning_rate': 1.1593582887700537e-05, 'mean_token_accuracy': 0.9414165675640106, 'epoch': 4.2}
444
+ {'loss': 0.1728, 'grad_norm': 1.7148997783660889, 'learning_rate': 1.1572192513368986e-05, 'mean_token_accuracy': 0.9447316288948059, 'epoch': 4.21}
445
+ {'loss': 0.1757, 'grad_norm': 1.7448234558105469, 'learning_rate': 1.1550802139037434e-05, 'mean_token_accuracy': 0.9497358977794648, 'epoch': 4.22}
446
+ {'loss': 0.1554, 'grad_norm': 1.57687246799469, 'learning_rate': 1.1529411764705882e-05, 'mean_token_accuracy': 0.9528406322002411, 'epoch': 4.24}
447
+ {'loss': 0.1737, 'grad_norm': 1.537142276763916, 'learning_rate': 1.1508021390374334e-05, 'mean_token_accuracy': 0.9442233860492706, 'epoch': 4.25}
448
+ {'loss': 0.1605, 'grad_norm': 1.6654634475708008, 'learning_rate': 1.1486631016042781e-05, 'mean_token_accuracy': 0.9521208226680755, 'epoch': 4.26}
449
+ {'loss': 0.1707, 'grad_norm': 1.2931082248687744, 'learning_rate': 1.146524064171123e-05, 'mean_token_accuracy': 0.9495856404304505, 'epoch': 4.27}
450
+ {'loss': 0.1737, 'grad_norm': 1.9283350706100464, 'learning_rate': 1.1443850267379679e-05, 'mean_token_accuracy': 0.9470351815223694, 'epoch': 4.28}
451
+ {'loss': 0.1606, 'grad_norm': 1.7414613962173462, 'learning_rate': 1.142245989304813e-05, 'mean_token_accuracy': 0.9515282690525055, 'epoch': 4.29}
452
+ {'loss': 0.1881, 'grad_norm': 1.5943981409072876, 'learning_rate': 1.1401069518716579e-05, 'mean_token_accuracy': 0.9437048673629761, 'epoch': 4.3}
453
+ {'loss': 0.1662, 'grad_norm': 1.9061501026153564, 'learning_rate': 1.1379679144385028e-05, 'mean_token_accuracy': 0.9491634905338288, 'epoch': 4.31}
454
+ {'loss': 0.1741, 'grad_norm': 1.934288740158081, 'learning_rate': 1.1358288770053476e-05, 'mean_token_accuracy': 0.9466652691364288, 'epoch': 4.32}
455
+ {'loss': 0.1528, 'grad_norm': 1.8484306335449219, 'learning_rate': 1.1336898395721927e-05, 'mean_token_accuracy': 0.9520830154418946, 'epoch': 4.33}
456
+ {'loss': 0.1696, 'grad_norm': 1.4194589853286743, 'learning_rate': 1.1315508021390376e-05, 'mean_token_accuracy': 0.9487707138061523, 'epoch': 4.34}
457
+ {'loss': 0.207, 'grad_norm': 1.764488697052002, 'learning_rate': 1.1294117647058825e-05, 'mean_token_accuracy': 0.9375714957714081, 'epoch': 4.35}
458
+ {'loss': 0.1674, 'grad_norm': 1.8727854490280151, 'learning_rate': 1.1272727272727272e-05, 'mean_token_accuracy': 0.9494613409042358, 'epoch': 4.36}
459
+ {'loss': 0.1898, 'grad_norm': 2.1858925819396973, 'learning_rate': 1.1251336898395724e-05, 'mean_token_accuracy': 0.9412040770053863, 'epoch': 4.37}
460
+ {'loss': 0.1818, 'grad_norm': 1.6867077350616455, 'learning_rate': 1.1229946524064172e-05, 'mean_token_accuracy': 0.9427337288856507, 'epoch': 4.39}
461
+ {'loss': 0.1863, 'grad_norm': 1.748297929763794, 'learning_rate': 1.120855614973262e-05, 'mean_token_accuracy': 0.9438842356204986, 'epoch': 4.4}
462
+ {'loss': 0.1521, 'grad_norm': 1.87770676612854, 'learning_rate': 1.118716577540107e-05, 'mean_token_accuracy': 0.9539277970790863, 'epoch': 4.41}
463
+ {'loss': 0.1801, 'grad_norm': 1.8535561561584473, 'learning_rate': 1.116577540106952e-05, 'mean_token_accuracy': 0.9474570631980896, 'epoch': 4.42}
464
+ {'loss': 0.1831, 'grad_norm': 1.4291088581085205, 'learning_rate': 1.1144385026737969e-05, 'mean_token_accuracy': 0.9447618722915649, 'epoch': 4.43}
465
+ {'loss': 0.1672, 'grad_norm': 1.625227689743042, 'learning_rate': 1.1122994652406418e-05, 'mean_token_accuracy': 0.9495197236537933, 'epoch': 4.44}
466
+ {'loss': 0.1971, 'grad_norm': 2.153144598007202, 'learning_rate': 1.1101604278074867e-05, 'mean_token_accuracy': 0.940097314119339, 'epoch': 4.45}
467
+ {'loss': 0.1775, 'grad_norm': 2.076204299926758, 'learning_rate': 1.1080213903743316e-05, 'mean_token_accuracy': 0.9468014240264893, 'epoch': 4.46}
468
+ {'loss': 0.1754, 'grad_norm': 1.476561427116394, 'learning_rate': 1.1058823529411766e-05, 'mean_token_accuracy': 0.9460311770439148, 'epoch': 4.47}
469
+ {'loss': 0.1668, 'grad_norm': 2.067399024963379, 'learning_rate': 1.1037433155080215e-05, 'mean_token_accuracy': 0.9486723065376281, 'epoch': 4.48}
470
+ {'loss': 0.1655, 'grad_norm': 1.3543881177902222, 'learning_rate': 1.1016042780748664e-05, 'mean_token_accuracy': 0.9500215172767639, 'epoch': 4.49}
471
+ {'loss': 0.1721, 'grad_norm': 1.7718254327774048, 'learning_rate': 1.0994652406417112e-05, 'mean_token_accuracy': 0.9470746457576752, 'epoch': 4.5}
472
+ {'loss': 0.1893, 'grad_norm': 1.6638853549957275, 'learning_rate': 1.0973262032085564e-05, 'mean_token_accuracy': 0.9455216109752655, 'epoch': 4.51}
473
+ {'loss': 0.1691, 'grad_norm': 1.9174784421920776, 'learning_rate': 1.0951871657754011e-05, 'mean_token_accuracy': 0.950607305765152, 'epoch': 4.52}
474
+ {'loss': 0.1883, 'grad_norm': 1.8073397874832153, 'learning_rate': 1.093048128342246e-05, 'mean_token_accuracy': 0.9437803864479065, 'epoch': 4.53}
475
+ {'loss': 0.1957, 'grad_norm': 1.5935677289962769, 'learning_rate': 1.0909090909090909e-05, 'mean_token_accuracy': 0.9387097239494324, 'epoch': 4.55}
476
+ {'loss': 0.1963, 'grad_norm': 1.7583671808242798, 'learning_rate': 1.088770053475936e-05, 'mean_token_accuracy': 0.9426051914691925, 'epoch': 4.56}
477
+ {'loss': 0.1719, 'grad_norm': 1.8803505897521973, 'learning_rate': 1.0866310160427808e-05, 'mean_token_accuracy': 0.9474845051765441, 'epoch': 4.57}
478
+ {'loss': 0.1934, 'grad_norm': 2.505721092224121, 'learning_rate': 1.0844919786096257e-05, 'mean_token_accuracy': 0.9428565561771393, 'epoch': 4.58}
479
+ {'loss': 0.1717, 'grad_norm': 1.605396032333374, 'learning_rate': 1.0823529411764706e-05, 'mean_token_accuracy': 0.9467872560024262, 'epoch': 4.59}
480
+ {'loss': 0.1819, 'grad_norm': 1.512045979499817, 'learning_rate': 1.0802139037433157e-05, 'mean_token_accuracy': 0.9463649451732635, 'epoch': 4.6}
481
+ {'loss': 0.2007, 'grad_norm': 2.0963022708892822, 'learning_rate': 1.0780748663101606e-05, 'mean_token_accuracy': 0.9346943676471711, 'epoch': 4.61}
482
+ {'loss': 0.1896, 'grad_norm': 1.7960457801818848, 'learning_rate': 1.0759358288770055e-05, 'mean_token_accuracy': 0.9400655329227448, 'epoch': 4.62}
483
+ {'loss': 0.1752, 'grad_norm': 1.6520633697509766, 'learning_rate': 1.0737967914438504e-05, 'mean_token_accuracy': 0.9477445900440216, 'epoch': 4.63}
484
+ {'loss': 0.1833, 'grad_norm': 1.9244734048843384, 'learning_rate': 1.0716577540106954e-05, 'mean_token_accuracy': 0.9424504935741425, 'epoch': 4.64}
485
+ {'loss': 0.1856, 'grad_norm': 1.5138581991195679, 'learning_rate': 1.0695187165775403e-05, 'mean_token_accuracy': 0.9422859311103821, 'epoch': 4.65}
486
+ {'loss': 0.1705, 'grad_norm': 1.9214117527008057, 'learning_rate': 1.067379679144385e-05, 'mean_token_accuracy': 0.9483125925064086, 'epoch': 4.66}
487
+ {'loss': 0.1704, 'grad_norm': 1.7379252910614014, 'learning_rate': 1.06524064171123e-05, 'mean_token_accuracy': 0.9481765508651734, 'epoch': 4.67}
488
+ {'loss': 0.1752, 'grad_norm': 1.642674446105957, 'learning_rate': 1.0631016042780748e-05, 'mean_token_accuracy': 0.9503016233444214, 'epoch': 4.68}
489
+ {'loss': 0.1581, 'grad_norm': 1.7870064973831177, 'learning_rate': 1.0609625668449199e-05, 'mean_token_accuracy': 0.9496883094310761, 'epoch': 4.7}
490
+ {'loss': 0.175, 'grad_norm': 1.789838433265686, 'learning_rate': 1.0588235294117648e-05, 'mean_token_accuracy': 0.9485034823417664, 'epoch': 4.71}
491
+ {'loss': 0.1902, 'grad_norm': 1.4657063484191895, 'learning_rate': 1.0566844919786097e-05, 'mean_token_accuracy': 0.9413743674755096, 'epoch': 4.72}
492
+ {'loss': 0.1655, 'grad_norm': 1.667333722114563, 'learning_rate': 1.0545454545454546e-05, 'mean_token_accuracy': 0.9504335641860961, 'epoch': 4.73}
493
+ {'loss': 0.1854, 'grad_norm': 2.1561086177825928, 'learning_rate': 1.0524064171122996e-05, 'mean_token_accuracy': 0.9446908056735992, 'epoch': 4.74}
494
+ {'loss': 0.1731, 'grad_norm': 2.2415318489074707, 'learning_rate': 1.0502673796791445e-05, 'mean_token_accuracy': 0.9452531158924102, 'epoch': 4.75}
495
+ {'loss': 0.1587, 'grad_norm': 1.6177361011505127, 'learning_rate': 1.0481283422459894e-05, 'mean_token_accuracy': 0.9538649678230285, 'epoch': 4.76}
496
+ {'loss': 0.1744, 'grad_norm': 1.8750890493392944, 'learning_rate': 1.0459893048128343e-05, 'mean_token_accuracy': 0.9453363537788391, 'epoch': 4.77}
497
+ {'loss': 0.1622, 'grad_norm': 1.6198896169662476, 'learning_rate': 1.0438502673796794e-05, 'mean_token_accuracy': 0.948049259185791, 'epoch': 4.78}
498
+ {'loss': 0.1739, 'grad_norm': 1.6797212362289429, 'learning_rate': 1.0417112299465243e-05, 'mean_token_accuracy': 0.9473335146903992, 'epoch': 4.79}
499
+ {'loss': 0.1505, 'grad_norm': 1.5805342197418213, 'learning_rate': 1.039572192513369e-05, 'mean_token_accuracy': 0.9550497591495514, 'epoch': 4.8}
500
+ {'loss': 0.1595, 'grad_norm': 2.010617971420288, 'learning_rate': 1.0374331550802139e-05, 'mean_token_accuracy': 0.9505820333957672, 'epoch': 4.81}
501
+ {'loss': 0.2022, 'grad_norm': 1.6834867000579834, 'learning_rate': 1.035294117647059e-05, 'mean_token_accuracy': 0.9358161687850952, 'epoch': 4.82}
502
+ {'loss': 0.2084, 'grad_norm': 2.1624224185943604, 'learning_rate': 1.0331550802139038e-05, 'mean_token_accuracy': 0.933703875541687, 'epoch': 4.83}
503
+ {'loss': 0.1979, 'grad_norm': 1.9009312391281128, 'learning_rate': 1.0310160427807487e-05, 'mean_token_accuracy': 0.9391363799571991, 'epoch': 4.84}
504
+ {'loss': 0.1759, 'grad_norm': 1.8317028284072876, 'learning_rate': 1.0288770053475936e-05, 'mean_token_accuracy': 0.9460435450077057, 'epoch': 4.86}
505
+ {'loss': 0.1523, 'grad_norm': 1.6002405881881714, 'learning_rate': 1.0267379679144387e-05, 'mean_token_accuracy': 0.9512880504131317, 'epoch': 4.87}
506
+ {'loss': 0.1784, 'grad_norm': 1.6229273080825806, 'learning_rate': 1.0245989304812836e-05, 'mean_token_accuracy': 0.9429602146148681, 'epoch': 4.88}
507
+ {'loss': 0.1722, 'grad_norm': 1.5719101428985596, 'learning_rate': 1.0224598930481285e-05, 'mean_token_accuracy': 0.9479642510414124, 'epoch': 4.89}
508
+ {'loss': 0.1756, 'grad_norm': 1.8305261135101318, 'learning_rate': 1.0203208556149734e-05, 'mean_token_accuracy': 0.9461706519126892, 'epoch': 4.9}
509
+ {'loss': 0.163, 'grad_norm': 1.7393313646316528, 'learning_rate': 1.0181818181818182e-05, 'mean_token_accuracy': 0.9523954331874848, 'epoch': 4.91}
510
+ {'loss': 0.1902, 'grad_norm': 1.8800948858261108, 'learning_rate': 1.0160427807486633e-05, 'mean_token_accuracy': 0.9459931075572967, 'epoch': 4.92}
511
+ {'loss': 0.1772, 'grad_norm': 1.6756889820098877, 'learning_rate': 1.0139037433155082e-05, 'mean_token_accuracy': 0.9421909153461456, 'epoch': 4.93}
512
+ {'loss': 0.1841, 'grad_norm': 1.647567868232727, 'learning_rate': 1.011764705882353e-05, 'mean_token_accuracy': 0.9408629298210144, 'epoch': 4.94}
513
+ {'loss': 0.1749, 'grad_norm': 2.1712002754211426, 'learning_rate': 1.0096256684491978e-05, 'mean_token_accuracy': 0.9471596658229828, 'epoch': 4.95}
514
+ {'loss': 0.1925, 'grad_norm': 2.217067241668701, 'learning_rate': 1.0074866310160429e-05, 'mean_token_accuracy': 0.9420334160327911, 'epoch': 4.96}
515
+ {'loss': 0.1837, 'grad_norm': 1.6498557329177856, 'learning_rate': 1.0053475935828878e-05, 'mean_token_accuracy': 0.943232637643814, 'epoch': 4.97}
516
+ {'loss': 0.1766, 'grad_norm': 1.7455774545669556, 'learning_rate': 1.0032085561497327e-05, 'mean_token_accuracy': 0.9466847658157349, 'epoch': 4.98}
517
+ {'loss': 0.176, 'grad_norm': 1.5071252584457397, 'learning_rate': 1.0010695187165776e-05, 'mean_token_accuracy': 0.9437964498996735, 'epoch': 4.99}
518
+ {'eval_loss': 0.9386810660362244, 'eval_runtime': 97.3878, 'eval_samples_per_second': 13.544, 'eval_steps_per_second': 0.852, 'eval_mean_token_accuracy': 0.8124508329413154, 'epoch': 5.0}
519
+ model-00001-of-00002.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4.96G/4.96G [03:10<00:00, 26.0MB/s]
520
+ Upload 5 LFS files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [03:10<00:00, 38.15s/it]3:09<00:00, 36.3MB/s]
521
+ run-e1n3xkh6.wandb: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 2.77M/3.47M [00:00<00:00, 18.8MB/s]
522
+
523
+
524
+ Upload 5 LFS files: 20%|β–ˆβ–ˆ | 1/5 [02:18<09:14, 138.52s/it]
525
+ Upload 5 LFS files: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3/5 [03:10<01:50, 55.26s/it]
wandb/run-20250402_145246-e1n3xkh6/run-e1n3xkh6.wandb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a083fa7c31ab607fb61a85d4739baad73234ed919cd42bec69f8e1bbd100a872
3
- size 2621440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c44f75d6a14404a1671103c98761a241b56e18c8a16b5f39c30a93b752bf2c4
3
+ size 3473408