metncelik commited on
Commit
d9339c1
·
verified ·
1 Parent(s): 0159e8c

Upload folder using huggingface_hub

Browse files
added_tokens.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|endofline|>": 50259,
3
+ "<|endofsong|>": 50261,
4
+ "<|pad|>": 50257,
5
+ "<|startofline|>": 50258,
6
+ "<|startofsong|>": 50260
7
+ }
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 1024,
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": null,
17
+ "n_layer": 12,
18
+ "n_positions": 1024,
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.1,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "task_specific_params": {
29
+ "text-generation": {
30
+ "do_sample": true,
31
+ "max_length": 50
32
+ }
33
+ },
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.51.0",
36
+ "use_cache": true,
37
+ "vocab_size": 50262
38
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.51.0"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5c0caa5b22fe458e8a2043ecbb3fcaead2ef94184d368a5a08079fa20a86301
3
+ size 497789568
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41bd3f9bad85eb208c4251cc31c23675a54cce589c7fceb72ffd944b138d0b48
3
+ size 995673018
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbff58bc3d4797a1329aa4a9e623f270d002c625cb63efe7325288189b65fc10
3
+ size 14244
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b20682d68cd70b8b0b208c3bd5f62edfdf7322c345d5f8fb0fc9912581d14875
3
+ size 988
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eedf8430982bdd1e0d375cd000173068452f11c280e11e4047056f66bab10d96
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|startofline|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|endofline|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<|startofsong|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<|endofsong|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ ],
32
+ "bos_token": {
33
+ "content": "<|endoftext|>",
34
+ "lstrip": false,
35
+ "normalized": true,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
+ "eos_token": {
40
+ "content": "<|endoftext|>",
41
+ "lstrip": false,
42
+ "normalized": true,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ },
46
+ "pad_token": {
47
+ "content": "<|pad|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false
52
+ },
53
+ "unk_token": {
54
+ "content": "<|endoftext|>",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false
59
+ }
60
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "50257": {
14
+ "content": "<|pad|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "50258": {
22
+ "content": "<|startofline|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "50259": {
30
+ "content": "<|endofline|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "50260": {
38
+ "content": "<|startofsong|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "50261": {
46
+ "content": "<|endofsong|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ }
53
+ },
54
+ "additional_special_tokens": [
55
+ "<|startofline|>",
56
+ "<|endofline|>",
57
+ "<|startofsong|>",
58
+ "<|endofsong|>"
59
+ ],
60
+ "bos_token": "<|endoftext|>",
61
+ "clean_up_tokenization_spaces": true,
62
+ "eos_token": "<|endoftext|>",
63
+ "errors": "replace",
64
+ "extra_special_tokens": {},
65
+ "model_max_length": 1024,
66
+ "pad_token": "<|pad|>",
67
+ "tokenizer_class": "GPT2Tokenizer",
68
+ "unk_token": "<|endoftext|>"
69
+ }
trainer_state.json ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 4500,
3
+ "best_metric": 4.143945693969727,
4
+ "best_model_checkpoint": "checkpoints/checkpoint-4500",
5
+ "epoch": 4.999113362887904,
6
+ "eval_steps": 500,
7
+ "global_step": 4930,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.1013299556681444,
14
+ "grad_norm": 53.731651306152344,
15
+ "learning_rate": 9.600000000000001e-06,
16
+ "loss": 8.2301,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.2026599113362888,
21
+ "grad_norm": 60.03353500366211,
22
+ "learning_rate": 1.9600000000000002e-05,
23
+ "loss": 7.2402,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.3039898670044332,
28
+ "grad_norm": 56.0938606262207,
29
+ "learning_rate": 2.96e-05,
30
+ "loss": 6.1337,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.4053198226725776,
35
+ "grad_norm": 7.5981974601745605,
36
+ "learning_rate": 3.960000000000001e-05,
37
+ "loss": 5.0118,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.506649778340722,
42
+ "grad_norm": 3.361119270324707,
43
+ "learning_rate": 4.96e-05,
44
+ "loss": 4.6704,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.506649778340722,
49
+ "eval_loss": 4.494482517242432,
50
+ "eval_runtime": 61.163,
51
+ "eval_samples_per_second": 57.371,
52
+ "eval_steps_per_second": 14.355,
53
+ "step": 500
54
+ },
55
+ {
56
+ "epoch": 0.6079797340088664,
57
+ "grad_norm": 2.7135744094848633,
58
+ "learning_rate": 4.891647855530474e-05,
59
+ "loss": 4.642,
60
+ "step": 600
61
+ },
62
+ {
63
+ "epoch": 0.7093096896770108,
64
+ "grad_norm": 3.0142900943756104,
65
+ "learning_rate": 4.7787810383747176e-05,
66
+ "loss": 4.5258,
67
+ "step": 700
68
+ },
69
+ {
70
+ "epoch": 0.8106396453451552,
71
+ "grad_norm": 2.4529480934143066,
72
+ "learning_rate": 4.665914221218962e-05,
73
+ "loss": 4.5282,
74
+ "step": 800
75
+ },
76
+ {
77
+ "epoch": 0.9119696010132996,
78
+ "grad_norm": 2.8447184562683105,
79
+ "learning_rate": 4.553047404063205e-05,
80
+ "loss": 4.4796,
81
+ "step": 900
82
+ },
83
+ {
84
+ "epoch": 1.0141861937935401,
85
+ "grad_norm": 2.4824981689453125,
86
+ "learning_rate": 4.440180586907449e-05,
87
+ "loss": 4.5166,
88
+ "step": 1000
89
+ },
90
+ {
91
+ "epoch": 1.0141861937935401,
92
+ "eval_loss": 4.318243980407715,
93
+ "eval_runtime": 61.4718,
94
+ "eval_samples_per_second": 57.083,
95
+ "eval_steps_per_second": 14.283,
96
+ "step": 1000
97
+ },
98
+ {
99
+ "epoch": 1.1155161494616845,
100
+ "grad_norm": 2.3135533332824707,
101
+ "learning_rate": 4.327313769751693e-05,
102
+ "loss": 4.41,
103
+ "step": 1100
104
+ },
105
+ {
106
+ "epoch": 1.216846105129829,
107
+ "grad_norm": 2.297306537628174,
108
+ "learning_rate": 4.214446952595937e-05,
109
+ "loss": 4.429,
110
+ "step": 1200
111
+ },
112
+ {
113
+ "epoch": 1.3181760607979733,
114
+ "grad_norm": 2.43269944190979,
115
+ "learning_rate": 4.101580135440181e-05,
116
+ "loss": 4.3426,
117
+ "step": 1300
118
+ },
119
+ {
120
+ "epoch": 1.4195060164661177,
121
+ "grad_norm": 2.2192583084106445,
122
+ "learning_rate": 3.988713318284424e-05,
123
+ "loss": 4.3375,
124
+ "step": 1400
125
+ },
126
+ {
127
+ "epoch": 1.5208359721342621,
128
+ "grad_norm": 2.4804940223693848,
129
+ "learning_rate": 3.875846501128668e-05,
130
+ "loss": 4.3412,
131
+ "step": 1500
132
+ },
133
+ {
134
+ "epoch": 1.5208359721342621,
135
+ "eval_loss": 4.256350994110107,
136
+ "eval_runtime": 61.1509,
137
+ "eval_samples_per_second": 57.383,
138
+ "eval_steps_per_second": 14.358,
139
+ "step": 1500
140
+ },
141
+ {
142
+ "epoch": 1.6221659278024065,
143
+ "grad_norm": 2.0531516075134277,
144
+ "learning_rate": 3.762979683972912e-05,
145
+ "loss": 4.3421,
146
+ "step": 1600
147
+ },
148
+ {
149
+ "epoch": 1.723495883470551,
150
+ "grad_norm": 2.3188295364379883,
151
+ "learning_rate": 3.650112866817156e-05,
152
+ "loss": 4.3078,
153
+ "step": 1700
154
+ },
155
+ {
156
+ "epoch": 1.8248258391386953,
157
+ "grad_norm": 2.002288341522217,
158
+ "learning_rate": 3.5372460496614e-05,
159
+ "loss": 4.3323,
160
+ "step": 1800
161
+ },
162
+ {
163
+ "epoch": 1.9261557948068397,
164
+ "grad_norm": 2.439591646194458,
165
+ "learning_rate": 3.424379232505643e-05,
166
+ "loss": 4.3238,
167
+ "step": 1900
168
+ },
169
+ {
170
+ "epoch": 2.0283723875870803,
171
+ "grad_norm": 2.314893960952759,
172
+ "learning_rate": 3.3115124153498873e-05,
173
+ "loss": 4.3637,
174
+ "step": 2000
175
+ },
176
+ {
177
+ "epoch": 2.0283723875870803,
178
+ "eval_loss": 4.224379539489746,
179
+ "eval_runtime": 61.2066,
180
+ "eval_samples_per_second": 57.33,
181
+ "eval_steps_per_second": 14.345,
182
+ "step": 2000
183
+ },
184
+ {
185
+ "epoch": 2.1297023432552247,
186
+ "grad_norm": 2.339419364929199,
187
+ "learning_rate": 3.198645598194131e-05,
188
+ "loss": 4.2702,
189
+ "step": 2100
190
+ },
191
+ {
192
+ "epoch": 2.231032298923369,
193
+ "grad_norm": 2.3098413944244385,
194
+ "learning_rate": 3.085778781038375e-05,
195
+ "loss": 4.2843,
196
+ "step": 2200
197
+ },
198
+ {
199
+ "epoch": 2.3323622545915135,
200
+ "grad_norm": 2.1247897148132324,
201
+ "learning_rate": 2.9729119638826186e-05,
202
+ "loss": 4.2886,
203
+ "step": 2300
204
+ },
205
+ {
206
+ "epoch": 2.433692210259658,
207
+ "grad_norm": 2.2844860553741455,
208
+ "learning_rate": 2.8600451467268623e-05,
209
+ "loss": 4.2878,
210
+ "step": 2400
211
+ },
212
+ {
213
+ "epoch": 2.5350221659278023,
214
+ "grad_norm": 2.0234220027923584,
215
+ "learning_rate": 2.747178329571106e-05,
216
+ "loss": 4.2488,
217
+ "step": 2500
218
+ },
219
+ {
220
+ "epoch": 2.5350221659278023,
221
+ "eval_loss": 4.1915154457092285,
222
+ "eval_runtime": 61.5807,
223
+ "eval_samples_per_second": 56.982,
224
+ "eval_steps_per_second": 14.258,
225
+ "step": 2500
226
+ },
227
+ {
228
+ "epoch": 2.6363521215959467,
229
+ "grad_norm": 2.2979116439819336,
230
+ "learning_rate": 2.63431151241535e-05,
231
+ "loss": 4.2168,
232
+ "step": 2600
233
+ },
234
+ {
235
+ "epoch": 2.737682077264091,
236
+ "grad_norm": 2.3979334831237793,
237
+ "learning_rate": 2.521444695259594e-05,
238
+ "loss": 4.2475,
239
+ "step": 2700
240
+ },
241
+ {
242
+ "epoch": 2.8390120329322355,
243
+ "grad_norm": 2.207998037338257,
244
+ "learning_rate": 2.4085778781038376e-05,
245
+ "loss": 4.2249,
246
+ "step": 2800
247
+ },
248
+ {
249
+ "epoch": 2.94034198860038,
250
+ "grad_norm": 2.1592469215393066,
251
+ "learning_rate": 2.2957110609480814e-05,
252
+ "loss": 4.2234,
253
+ "step": 2900
254
+ },
255
+ {
256
+ "epoch": 3.0425585813806206,
257
+ "grad_norm": 2.058875560760498,
258
+ "learning_rate": 2.182844243792325e-05,
259
+ "loss": 4.2611,
260
+ "step": 3000
261
+ },
262
+ {
263
+ "epoch": 3.0425585813806206,
264
+ "eval_loss": 4.1719160079956055,
265
+ "eval_runtime": 61.4253,
266
+ "eval_samples_per_second": 57.126,
267
+ "eval_steps_per_second": 14.294,
268
+ "step": 3000
269
+ },
270
+ {
271
+ "epoch": 3.143888537048765,
272
+ "grad_norm": 2.292440414428711,
273
+ "learning_rate": 2.069977426636569e-05,
274
+ "loss": 4.1453,
275
+ "step": 3100
276
+ },
277
+ {
278
+ "epoch": 3.2452184927169094,
279
+ "grad_norm": 2.633338451385498,
280
+ "learning_rate": 1.957110609480813e-05,
281
+ "loss": 4.182,
282
+ "step": 3200
283
+ },
284
+ {
285
+ "epoch": 3.346548448385054,
286
+ "grad_norm": 2.1391022205352783,
287
+ "learning_rate": 1.8442437923250567e-05,
288
+ "loss": 4.1905,
289
+ "step": 3300
290
+ },
291
+ {
292
+ "epoch": 3.4478784040531982,
293
+ "grad_norm": 2.2888920307159424,
294
+ "learning_rate": 1.7313769751693004e-05,
295
+ "loss": 4.1835,
296
+ "step": 3400
297
+ },
298
+ {
299
+ "epoch": 3.5492083597213426,
300
+ "grad_norm": 2.186450481414795,
301
+ "learning_rate": 1.6185101580135442e-05,
302
+ "loss": 4.2083,
303
+ "step": 3500
304
+ },
305
+ {
306
+ "epoch": 3.5492083597213426,
307
+ "eval_loss": 4.161413192749023,
308
+ "eval_runtime": 61.3496,
309
+ "eval_samples_per_second": 57.197,
310
+ "eval_steps_per_second": 14.311,
311
+ "step": 3500
312
+ },
313
+ {
314
+ "epoch": 3.650538315389487,
315
+ "grad_norm": 2.072542190551758,
316
+ "learning_rate": 1.5056433408577881e-05,
317
+ "loss": 4.2393,
318
+ "step": 3600
319
+ },
320
+ {
321
+ "epoch": 3.7518682710576314,
322
+ "grad_norm": 2.1177375316619873,
323
+ "learning_rate": 1.3927765237020315e-05,
324
+ "loss": 4.2243,
325
+ "step": 3700
326
+ },
327
+ {
328
+ "epoch": 3.853198226725776,
329
+ "grad_norm": 2.2772135734558105,
330
+ "learning_rate": 1.2799097065462754e-05,
331
+ "loss": 4.2019,
332
+ "step": 3800
333
+ },
334
+ {
335
+ "epoch": 3.9545281823939202,
336
+ "grad_norm": 2.0697269439697266,
337
+ "learning_rate": 1.1670428893905193e-05,
338
+ "loss": 4.2285,
339
+ "step": 3900
340
+ },
341
+ {
342
+ "epoch": 4.0567447751741605,
343
+ "grad_norm": 2.685513734817505,
344
+ "learning_rate": 1.054176072234763e-05,
345
+ "loss": 4.2405,
346
+ "step": 4000
347
+ },
348
+ {
349
+ "epoch": 4.0567447751741605,
350
+ "eval_loss": 4.150519847869873,
351
+ "eval_runtime": 61.2476,
352
+ "eval_samples_per_second": 57.292,
353
+ "eval_steps_per_second": 14.335,
354
+ "step": 4000
355
+ },
356
+ {
357
+ "epoch": 4.158074730842305,
358
+ "grad_norm": 2.0102524757385254,
359
+ "learning_rate": 9.413092550790068e-06,
360
+ "loss": 4.1516,
361
+ "step": 4100
362
+ },
363
+ {
364
+ "epoch": 4.259404686510449,
365
+ "grad_norm": 2.008261203765869,
366
+ "learning_rate": 8.284424379232506e-06,
367
+ "loss": 4.145,
368
+ "step": 4200
369
+ },
370
+ {
371
+ "epoch": 4.360734642178594,
372
+ "grad_norm": 2.0506038665771484,
373
+ "learning_rate": 7.155756207674943e-06,
374
+ "loss": 4.1747,
375
+ "step": 4300
376
+ },
377
+ {
378
+ "epoch": 4.462064597846738,
379
+ "grad_norm": 2.1455721855163574,
380
+ "learning_rate": 6.0270880361173815e-06,
381
+ "loss": 4.1857,
382
+ "step": 4400
383
+ },
384
+ {
385
+ "epoch": 4.5633945535148825,
386
+ "grad_norm": 2.101816177368164,
387
+ "learning_rate": 4.89841986455982e-06,
388
+ "loss": 4.1556,
389
+ "step": 4500
390
+ },
391
+ {
392
+ "epoch": 4.5633945535148825,
393
+ "eval_loss": 4.143945693969727,
394
+ "eval_runtime": 61.2556,
395
+ "eval_samples_per_second": 57.285,
396
+ "eval_steps_per_second": 14.333,
397
+ "step": 4500
398
+ },
399
+ {
400
+ "epoch": 4.664724509183027,
401
+ "grad_norm": 2.203230619430542,
402
+ "learning_rate": 3.7697516930022577e-06,
403
+ "loss": 4.1613,
404
+ "step": 4600
405
+ },
406
+ {
407
+ "epoch": 4.766054464851171,
408
+ "grad_norm": 1.9926562309265137,
409
+ "learning_rate": 2.6410835214446955e-06,
410
+ "loss": 4.2135,
411
+ "step": 4700
412
+ },
413
+ {
414
+ "epoch": 4.867384420519316,
415
+ "grad_norm": 2.189359426498413,
416
+ "learning_rate": 1.5124153498871334e-06,
417
+ "loss": 4.1631,
418
+ "step": 4800
419
+ },
420
+ {
421
+ "epoch": 4.96871437618746,
422
+ "grad_norm": 2.3265554904937744,
423
+ "learning_rate": 3.837471783295711e-07,
424
+ "loss": 4.1366,
425
+ "step": 4900
426
+ }
427
+ ],
428
+ "logging_steps": 100,
429
+ "max_steps": 4930,
430
+ "num_input_tokens_seen": 0,
431
+ "num_train_epochs": 5,
432
+ "save_steps": 500,
433
+ "stateful_callbacks": {
434
+ "TrainerControl": {
435
+ "args": {
436
+ "should_epoch_stop": false,
437
+ "should_evaluate": false,
438
+ "should_log": false,
439
+ "should_save": true,
440
+ "should_training_stop": true
441
+ },
442
+ "attributes": {}
443
+ }
444
+ },
445
+ "total_flos": 8.2088082473472e+16,
446
+ "train_batch_size": 4,
447
+ "trial_name": null,
448
+ "trial_params": null
449
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5092045063b6078b36cb9bc3ca73af025833eebd5942cbab8ac1893d81ff270
3
+ size 5240
vocab.json ADDED
The diff for this file is too large to render. See raw diff