metncelik committed on
Commit
2b29889
·
verified ·
1 Parent(s): d9339c1

Upload folder using huggingface_hub

Browse files
added_tokens.json CHANGED
@@ -1,7 +1,3 @@
1
  {
2
- "<|endofline|>": 50259,
3
- "<|endofsong|>": 50261,
4
- "<|pad|>": 50257,
5
- "<|startofline|>": 50258,
6
- "<|startofsong|>": 50260
7
  }
 
1
  {
2
+ "<|pad|>": 50257
 
 
 
 
3
  }
config.json CHANGED
@@ -32,7 +32,7 @@
32
  }
33
  },
34
  "torch_dtype": "float32",
35
- "transformers_version": "4.51.0",
36
  "use_cache": true,
37
- "vocab_size": 50262
38
  }
 
32
  }
33
  },
34
  "torch_dtype": "float32",
35
+ "transformers_version": "4.51.3",
36
  "use_cache": true,
37
+ "vocab_size": 50258
38
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 50256,
4
  "eos_token_id": 50256,
5
- "transformers_version": "4.51.0"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 50256,
4
  "eos_token_id": 50256,
5
+ "transformers_version": "4.51.3"
6
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5c0caa5b22fe458e8a2043ecbb3fcaead2ef94184d368a5a08079fa20a86301
3
- size 497789568
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c09e23fb1f2834ca7fdbd63c9a0d1a894928f61dc6e29cef7522dda4a5d9a90
3
+ size 497777280
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41bd3f9bad85eb208c4251cc31c23675a54cce589c7fceb72ffd944b138d0b48
3
- size 995673018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3e39049298bf8caa4a0206af22d195193cc6a385b34b9870a35b9bcb7458df2
3
+ size 995648442
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bbff58bc3d4797a1329aa4a9e623f270d002c625cb63efe7325288189b65fc10
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a1c7dd0b3b463e2c58e2f44cbd38e25c00d34646a5c4e3ef2f3793b6c7a746a
3
  size 14244
special_tokens_map.json CHANGED
@@ -1,34 +1,4 @@
1
  {
2
- "additional_special_tokens": [
3
- {
4
- "content": "<|startofline|>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "<|endofline|>",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- },
17
- {
18
- "content": "<|startofsong|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- {
25
- "content": "<|endofsong|>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- }
31
- ],
32
  "bos_token": {
33
  "content": "<|endoftext|>",
34
  "lstrip": false,
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": {
3
  "content": "<|endoftext|>",
4
  "lstrip": false,
tokenizer.json CHANGED
@@ -32,42 +32,6 @@
32
  "rstrip": false,
33
  "normalized": false,
34
  "special": true
35
- },
36
- {
37
- "id": 50258,
38
- "content": "<|startofline|>",
39
- "single_word": false,
40
- "lstrip": false,
41
- "rstrip": false,
42
- "normalized": false,
43
- "special": true
44
- },
45
- {
46
- "id": 50259,
47
- "content": "<|endofline|>",
48
- "single_word": false,
49
- "lstrip": false,
50
- "rstrip": false,
51
- "normalized": false,
52
- "special": true
53
- },
54
- {
55
- "id": 50260,
56
- "content": "<|startofsong|>",
57
- "single_word": false,
58
- "lstrip": false,
59
- "rstrip": false,
60
- "normalized": false,
61
- "special": true
62
- },
63
- {
64
- "id": 50261,
65
- "content": "<|endofsong|>",
66
- "single_word": false,
67
- "lstrip": false,
68
- "rstrip": false,
69
- "normalized": false,
70
- "special": true
71
  }
72
  ],
73
  "normalizer": null,
 
32
  "rstrip": false,
33
  "normalized": false,
34
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  }
36
  ],
37
  "normalizer": null,
tokenizer_config.json CHANGED
@@ -17,46 +17,8 @@
17
  "rstrip": false,
18
  "single_word": false,
19
  "special": true
20
- },
21
- "50258": {
22
- "content": "<|startofline|>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "50259": {
30
- "content": "<|endofline|>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "50260": {
38
- "content": "<|startofsong|>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "50261": {
46
- "content": "<|endofsong|>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
  }
53
  },
54
- "additional_special_tokens": [
55
- "<|startofline|>",
56
- "<|endofline|>",
57
- "<|startofsong|>",
58
- "<|endofsong|>"
59
- ],
60
  "bos_token": "<|endoftext|>",
61
  "clean_up_tokenization_spaces": true,
62
  "eos_token": "<|endoftext|>",
 
17
  "rstrip": false,
18
  "single_word": false,
19
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  }
21
  },
 
 
 
 
 
 
22
  "bos_token": "<|endoftext|>",
23
  "clean_up_tokenization_spaces": true,
24
  "eos_token": "<|endoftext|>",
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 4500,
3
- "best_metric": 4.143945693969727,
4
  "best_model_checkpoint": "checkpoints/checkpoint-4500",
5
  "epoch": 4.999113362887904,
6
  "eval_steps": 500,
@@ -11,417 +11,417 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.1013299556681444,
14
- "grad_norm": 53.731651306152344,
15
  "learning_rate": 9.600000000000001e-06,
16
- "loss": 8.2301,
17
  "step": 100
18
  },
19
  {
20
  "epoch": 0.2026599113362888,
21
- "grad_norm": 60.03353500366211,
22
  "learning_rate": 1.9600000000000002e-05,
23
- "loss": 7.2402,
24
  "step": 200
25
  },
26
  {
27
  "epoch": 0.3039898670044332,
28
- "grad_norm": 56.0938606262207,
29
  "learning_rate": 2.96e-05,
30
- "loss": 6.1337,
31
  "step": 300
32
  },
33
  {
34
  "epoch": 0.4053198226725776,
35
- "grad_norm": 7.5981974601745605,
36
  "learning_rate": 3.960000000000001e-05,
37
- "loss": 5.0118,
38
  "step": 400
39
  },
40
  {
41
  "epoch": 0.506649778340722,
42
- "grad_norm": 3.361119270324707,
43
  "learning_rate": 4.96e-05,
44
- "loss": 4.6704,
45
  "step": 500
46
  },
47
  {
48
  "epoch": 0.506649778340722,
49
- "eval_loss": 4.494482517242432,
50
- "eval_runtime": 61.163,
51
- "eval_samples_per_second": 57.371,
52
- "eval_steps_per_second": 14.355,
53
  "step": 500
54
  },
55
  {
56
  "epoch": 0.6079797340088664,
57
- "grad_norm": 2.7135744094848633,
58
  "learning_rate": 4.891647855530474e-05,
59
- "loss": 4.642,
60
  "step": 600
61
  },
62
  {
63
  "epoch": 0.7093096896770108,
64
- "grad_norm": 3.0142900943756104,
65
  "learning_rate": 4.7787810383747176e-05,
66
- "loss": 4.5258,
67
  "step": 700
68
  },
69
  {
70
  "epoch": 0.8106396453451552,
71
- "grad_norm": 2.4529480934143066,
72
  "learning_rate": 4.665914221218962e-05,
73
- "loss": 4.5282,
74
  "step": 800
75
  },
76
  {
77
  "epoch": 0.9119696010132996,
78
- "grad_norm": 2.8447184562683105,
79
  "learning_rate": 4.553047404063205e-05,
80
- "loss": 4.4796,
81
  "step": 900
82
  },
83
  {
84
  "epoch": 1.0141861937935401,
85
- "grad_norm": 2.4824981689453125,
86
  "learning_rate": 4.440180586907449e-05,
87
- "loss": 4.5166,
88
  "step": 1000
89
  },
90
  {
91
  "epoch": 1.0141861937935401,
92
- "eval_loss": 4.318243980407715,
93
- "eval_runtime": 61.4718,
94
- "eval_samples_per_second": 57.083,
95
- "eval_steps_per_second": 14.283,
96
  "step": 1000
97
  },
98
  {
99
  "epoch": 1.1155161494616845,
100
- "grad_norm": 2.3135533332824707,
101
  "learning_rate": 4.327313769751693e-05,
102
- "loss": 4.41,
103
  "step": 1100
104
  },
105
  {
106
  "epoch": 1.216846105129829,
107
- "grad_norm": 2.297306537628174,
108
  "learning_rate": 4.214446952595937e-05,
109
- "loss": 4.429,
110
  "step": 1200
111
  },
112
  {
113
  "epoch": 1.3181760607979733,
114
- "grad_norm": 2.43269944190979,
115
  "learning_rate": 4.101580135440181e-05,
116
- "loss": 4.3426,
117
  "step": 1300
118
  },
119
  {
120
  "epoch": 1.4195060164661177,
121
- "grad_norm": 2.2192583084106445,
122
  "learning_rate": 3.988713318284424e-05,
123
- "loss": 4.3375,
124
  "step": 1400
125
  },
126
  {
127
  "epoch": 1.5208359721342621,
128
- "grad_norm": 2.4804940223693848,
129
  "learning_rate": 3.875846501128668e-05,
130
- "loss": 4.3412,
131
  "step": 1500
132
  },
133
  {
134
  "epoch": 1.5208359721342621,
135
- "eval_loss": 4.256350994110107,
136
- "eval_runtime": 61.1509,
137
- "eval_samples_per_second": 57.383,
138
- "eval_steps_per_second": 14.358,
139
  "step": 1500
140
  },
141
  {
142
  "epoch": 1.6221659278024065,
143
- "grad_norm": 2.0531516075134277,
144
  "learning_rate": 3.762979683972912e-05,
145
- "loss": 4.3421,
146
  "step": 1600
147
  },
148
  {
149
  "epoch": 1.723495883470551,
150
- "grad_norm": 2.3188295364379883,
151
  "learning_rate": 3.650112866817156e-05,
152
- "loss": 4.3078,
153
  "step": 1700
154
  },
155
  {
156
  "epoch": 1.8248258391386953,
157
- "grad_norm": 2.002288341522217,
158
  "learning_rate": 3.5372460496614e-05,
159
- "loss": 4.3323,
160
  "step": 1800
161
  },
162
  {
163
  "epoch": 1.9261557948068397,
164
- "grad_norm": 2.439591646194458,
165
  "learning_rate": 3.424379232505643e-05,
166
- "loss": 4.3238,
167
  "step": 1900
168
  },
169
  {
170
  "epoch": 2.0283723875870803,
171
- "grad_norm": 2.314893960952759,
172
  "learning_rate": 3.3115124153498873e-05,
173
- "loss": 4.3637,
174
  "step": 2000
175
  },
176
  {
177
  "epoch": 2.0283723875870803,
178
- "eval_loss": 4.224379539489746,
179
- "eval_runtime": 61.2066,
180
- "eval_samples_per_second": 57.33,
181
- "eval_steps_per_second": 14.345,
182
  "step": 2000
183
  },
184
  {
185
  "epoch": 2.1297023432552247,
186
- "grad_norm": 2.339419364929199,
187
  "learning_rate": 3.198645598194131e-05,
188
- "loss": 4.2702,
189
  "step": 2100
190
  },
191
  {
192
  "epoch": 2.231032298923369,
193
- "grad_norm": 2.3098413944244385,
194
  "learning_rate": 3.085778781038375e-05,
195
- "loss": 4.2843,
196
  "step": 2200
197
  },
198
  {
199
  "epoch": 2.3323622545915135,
200
- "grad_norm": 2.1247897148132324,
201
  "learning_rate": 2.9729119638826186e-05,
202
- "loss": 4.2886,
203
  "step": 2300
204
  },
205
  {
206
  "epoch": 2.433692210259658,
207
- "grad_norm": 2.2844860553741455,
208
  "learning_rate": 2.8600451467268623e-05,
209
- "loss": 4.2878,
210
  "step": 2400
211
  },
212
  {
213
  "epoch": 2.5350221659278023,
214
- "grad_norm": 2.0234220027923584,
215
  "learning_rate": 2.747178329571106e-05,
216
- "loss": 4.2488,
217
  "step": 2500
218
  },
219
  {
220
  "epoch": 2.5350221659278023,
221
- "eval_loss": 4.1915154457092285,
222
- "eval_runtime": 61.5807,
223
- "eval_samples_per_second": 56.982,
224
- "eval_steps_per_second": 14.258,
225
  "step": 2500
226
  },
227
  {
228
  "epoch": 2.6363521215959467,
229
- "grad_norm": 2.2979116439819336,
230
  "learning_rate": 2.63431151241535e-05,
231
- "loss": 4.2168,
232
  "step": 2600
233
  },
234
  {
235
  "epoch": 2.737682077264091,
236
- "grad_norm": 2.3979334831237793,
237
  "learning_rate": 2.521444695259594e-05,
238
- "loss": 4.2475,
239
  "step": 2700
240
  },
241
  {
242
  "epoch": 2.8390120329322355,
243
- "grad_norm": 2.207998037338257,
244
  "learning_rate": 2.4085778781038376e-05,
245
- "loss": 4.2249,
246
  "step": 2800
247
  },
248
  {
249
  "epoch": 2.94034198860038,
250
- "grad_norm": 2.1592469215393066,
251
  "learning_rate": 2.2957110609480814e-05,
252
- "loss": 4.2234,
253
  "step": 2900
254
  },
255
  {
256
  "epoch": 3.0425585813806206,
257
- "grad_norm": 2.058875560760498,
258
  "learning_rate": 2.182844243792325e-05,
259
- "loss": 4.2611,
260
  "step": 3000
261
  },
262
  {
263
  "epoch": 3.0425585813806206,
264
- "eval_loss": 4.1719160079956055,
265
- "eval_runtime": 61.4253,
266
- "eval_samples_per_second": 57.126,
267
- "eval_steps_per_second": 14.294,
268
  "step": 3000
269
  },
270
  {
271
  "epoch": 3.143888537048765,
272
- "grad_norm": 2.292440414428711,
273
  "learning_rate": 2.069977426636569e-05,
274
- "loss": 4.1453,
275
  "step": 3100
276
  },
277
  {
278
  "epoch": 3.2452184927169094,
279
- "grad_norm": 2.633338451385498,
280
  "learning_rate": 1.957110609480813e-05,
281
- "loss": 4.182,
282
  "step": 3200
283
  },
284
  {
285
  "epoch": 3.346548448385054,
286
- "grad_norm": 2.1391022205352783,
287
  "learning_rate": 1.8442437923250567e-05,
288
- "loss": 4.1905,
289
  "step": 3300
290
  },
291
  {
292
  "epoch": 3.4478784040531982,
293
- "grad_norm": 2.2888920307159424,
294
  "learning_rate": 1.7313769751693004e-05,
295
- "loss": 4.1835,
296
  "step": 3400
297
  },
298
  {
299
  "epoch": 3.5492083597213426,
300
- "grad_norm": 2.186450481414795,
301
  "learning_rate": 1.6185101580135442e-05,
302
- "loss": 4.2083,
303
  "step": 3500
304
  },
305
  {
306
  "epoch": 3.5492083597213426,
307
- "eval_loss": 4.161413192749023,
308
- "eval_runtime": 61.3496,
309
- "eval_samples_per_second": 57.197,
310
- "eval_steps_per_second": 14.311,
311
  "step": 3500
312
  },
313
  {
314
  "epoch": 3.650538315389487,
315
- "grad_norm": 2.072542190551758,
316
  "learning_rate": 1.5056433408577881e-05,
317
- "loss": 4.2393,
318
  "step": 3600
319
  },
320
  {
321
  "epoch": 3.7518682710576314,
322
- "grad_norm": 2.1177375316619873,
323
  "learning_rate": 1.3927765237020315e-05,
324
- "loss": 4.2243,
325
  "step": 3700
326
  },
327
  {
328
  "epoch": 3.853198226725776,
329
- "grad_norm": 2.2772135734558105,
330
  "learning_rate": 1.2799097065462754e-05,
331
- "loss": 4.2019,
332
  "step": 3800
333
  },
334
  {
335
  "epoch": 3.9545281823939202,
336
- "grad_norm": 2.0697269439697266,
337
  "learning_rate": 1.1670428893905193e-05,
338
- "loss": 4.2285,
339
  "step": 3900
340
  },
341
  {
342
  "epoch": 4.0567447751741605,
343
- "grad_norm": 2.685513734817505,
344
  "learning_rate": 1.054176072234763e-05,
345
- "loss": 4.2405,
346
  "step": 4000
347
  },
348
  {
349
  "epoch": 4.0567447751741605,
350
- "eval_loss": 4.150519847869873,
351
- "eval_runtime": 61.2476,
352
- "eval_samples_per_second": 57.292,
353
- "eval_steps_per_second": 14.335,
354
  "step": 4000
355
  },
356
  {
357
  "epoch": 4.158074730842305,
358
- "grad_norm": 2.0102524757385254,
359
  "learning_rate": 9.413092550790068e-06,
360
- "loss": 4.1516,
361
  "step": 4100
362
  },
363
  {
364
  "epoch": 4.259404686510449,
365
- "grad_norm": 2.008261203765869,
366
  "learning_rate": 8.284424379232506e-06,
367
- "loss": 4.145,
368
  "step": 4200
369
  },
370
  {
371
  "epoch": 4.360734642178594,
372
- "grad_norm": 2.0506038665771484,
373
  "learning_rate": 7.155756207674943e-06,
374
- "loss": 4.1747,
375
  "step": 4300
376
  },
377
  {
378
  "epoch": 4.462064597846738,
379
- "grad_norm": 2.1455721855163574,
380
  "learning_rate": 6.0270880361173815e-06,
381
- "loss": 4.1857,
382
  "step": 4400
383
  },
384
  {
385
  "epoch": 4.5633945535148825,
386
- "grad_norm": 2.101816177368164,
387
  "learning_rate": 4.89841986455982e-06,
388
- "loss": 4.1556,
389
  "step": 4500
390
  },
391
  {
392
  "epoch": 4.5633945535148825,
393
- "eval_loss": 4.143945693969727,
394
- "eval_runtime": 61.2556,
395
- "eval_samples_per_second": 57.285,
396
- "eval_steps_per_second": 14.333,
397
  "step": 4500
398
  },
399
  {
400
  "epoch": 4.664724509183027,
401
- "grad_norm": 2.203230619430542,
402
  "learning_rate": 3.7697516930022577e-06,
403
- "loss": 4.1613,
404
  "step": 4600
405
  },
406
  {
407
  "epoch": 4.766054464851171,
408
- "grad_norm": 1.9926562309265137,
409
  "learning_rate": 2.6410835214446955e-06,
410
- "loss": 4.2135,
411
  "step": 4700
412
  },
413
  {
414
  "epoch": 4.867384420519316,
415
- "grad_norm": 2.189359426498413,
416
  "learning_rate": 1.5124153498871334e-06,
417
- "loss": 4.1631,
418
  "step": 4800
419
  },
420
  {
421
  "epoch": 4.96871437618746,
422
- "grad_norm": 2.3265554904937744,
423
  "learning_rate": 3.837471783295711e-07,
424
- "loss": 4.1366,
425
  "step": 4900
426
  }
427
  ],
@@ -442,7 +442,7 @@
442
  "attributes": {}
443
  }
444
  },
445
- "total_flos": 8.2088082473472e+16,
446
  "train_batch_size": 4,
447
  "trial_name": null,
448
  "trial_params": null
 
1
  {
2
  "best_global_step": 4500,
3
+ "best_metric": 4.731945037841797,
4
  "best_model_checkpoint": "checkpoints/checkpoint-4500",
5
  "epoch": 4.999113362887904,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.1013299556681444,
14
+ "grad_norm": 4.671788692474365,
15
  "learning_rate": 9.600000000000001e-06,
16
+ "loss": 9.818,
17
  "step": 100
18
  },
19
  {
20
  "epoch": 0.2026599113362888,
21
+ "grad_norm": 2.0833730697631836,
22
  "learning_rate": 1.9600000000000002e-05,
23
+ "loss": 6.0125,
24
  "step": 200
25
  },
26
  {
27
  "epoch": 0.3039898670044332,
28
+ "grad_norm": 2.2255606651306152,
29
  "learning_rate": 2.96e-05,
30
+ "loss": 5.5478,
31
  "step": 300
32
  },
33
  {
34
  "epoch": 0.4053198226725776,
35
+ "grad_norm": 2.375220537185669,
36
  "learning_rate": 3.960000000000001e-05,
37
+ "loss": 5.3821,
38
  "step": 400
39
  },
40
  {
41
  "epoch": 0.506649778340722,
42
+ "grad_norm": 2.1432723999023438,
43
  "learning_rate": 4.96e-05,
44
+ "loss": 5.2798,
45
  "step": 500
46
  },
47
  {
48
  "epoch": 0.506649778340722,
49
+ "eval_loss": 5.099383354187012,
50
+ "eval_runtime": 61.4753,
51
+ "eval_samples_per_second": 57.08,
52
+ "eval_steps_per_second": 14.282,
53
  "step": 500
54
  },
55
  {
56
  "epoch": 0.6079797340088664,
57
+ "grad_norm": 2.5531654357910156,
58
  "learning_rate": 4.891647855530474e-05,
59
+ "loss": 5.2792,
60
  "step": 600
61
  },
62
  {
63
  "epoch": 0.7093096896770108,
64
+ "grad_norm": 1.8546665906906128,
65
  "learning_rate": 4.7787810383747176e-05,
66
+ "loss": 5.1646,
67
  "step": 700
68
  },
69
  {
70
  "epoch": 0.8106396453451552,
71
+ "grad_norm": 2.103972911834717,
72
  "learning_rate": 4.665914221218962e-05,
73
+ "loss": 5.1678,
74
  "step": 800
75
  },
76
  {
77
  "epoch": 0.9119696010132996,
78
+ "grad_norm": 2.0656638145446777,
79
  "learning_rate": 4.553047404063205e-05,
80
+ "loss": 5.1145,
81
  "step": 900
82
  },
83
  {
84
  "epoch": 1.0141861937935401,
85
+ "grad_norm": 2.003814697265625,
86
  "learning_rate": 4.440180586907449e-05,
87
+ "loss": 5.1537,
88
  "step": 1000
89
  },
90
  {
91
  "epoch": 1.0141861937935401,
92
+ "eval_loss": 4.938778877258301,
93
+ "eval_runtime": 61.4905,
94
+ "eval_samples_per_second": 57.066,
95
+ "eval_steps_per_second": 14.279,
96
  "step": 1000
97
  },
98
  {
99
  "epoch": 1.1155161494616845,
100
+ "grad_norm": 2.0927860736846924,
101
  "learning_rate": 4.327313769751693e-05,
102
+ "loss": 5.0454,
103
  "step": 1100
104
  },
105
  {
106
  "epoch": 1.216846105129829,
107
+ "grad_norm": 2.080012321472168,
108
  "learning_rate": 4.214446952595937e-05,
109
+ "loss": 5.0665,
110
  "step": 1200
111
  },
112
  {
113
  "epoch": 1.3181760607979733,
114
+ "grad_norm": 1.8747535943984985,
115
  "learning_rate": 4.101580135440181e-05,
116
+ "loss": 4.9706,
117
  "step": 1300
118
  },
119
  {
120
  "epoch": 1.4195060164661177,
121
+ "grad_norm": 1.901370882987976,
122
  "learning_rate": 3.988713318284424e-05,
123
+ "loss": 4.9639,
124
  "step": 1400
125
  },
126
  {
127
  "epoch": 1.5208359721342621,
128
+ "grad_norm": 1.9110698699951172,
129
  "learning_rate": 3.875846501128668e-05,
130
+ "loss": 4.9622,
131
  "step": 1500
132
  },
133
  {
134
  "epoch": 1.5208359721342621,
135
+ "eval_loss": 4.870736598968506,
136
+ "eval_runtime": 61.3393,
137
+ "eval_samples_per_second": 57.206,
138
+ "eval_steps_per_second": 14.314,
139
  "step": 1500
140
  },
141
  {
142
  "epoch": 1.6221659278024065,
143
+ "grad_norm": 1.8562686443328857,
144
  "learning_rate": 3.762979683972912e-05,
145
+ "loss": 4.9633,
146
  "step": 1600
147
  },
148
  {
149
  "epoch": 1.723495883470551,
150
+ "grad_norm": 1.970841884613037,
151
  "learning_rate": 3.650112866817156e-05,
152
+ "loss": 4.9279,
153
  "step": 1700
154
  },
155
  {
156
  "epoch": 1.8248258391386953,
157
+ "grad_norm": 1.9571095705032349,
158
  "learning_rate": 3.5372460496614e-05,
159
+ "loss": 4.952,
160
  "step": 1800
161
  },
162
  {
163
  "epoch": 1.9261557948068397,
164
+ "grad_norm": 2.0035080909729004,
165
  "learning_rate": 3.424379232505643e-05,
166
+ "loss": 4.9448,
167
  "step": 1900
168
  },
169
  {
170
  "epoch": 2.0283723875870803,
171
+ "grad_norm": 1.978408694267273,
172
  "learning_rate": 3.3115124153498873e-05,
173
+ "loss": 4.9862,
174
  "step": 2000
175
  },
176
  {
177
  "epoch": 2.0283723875870803,
178
+ "eval_loss": 4.826657295227051,
179
+ "eval_runtime": 61.364,
180
+ "eval_samples_per_second": 57.183,
181
+ "eval_steps_per_second": 14.308,
182
  "step": 2000
183
  },
184
  {
185
  "epoch": 2.1297023432552247,
186
+ "grad_norm": 2.2265381813049316,
187
  "learning_rate": 3.198645598194131e-05,
188
+ "loss": 4.8837,
189
  "step": 2100
190
  },
191
  {
192
  "epoch": 2.231032298923369,
193
+ "grad_norm": 1.8263903856277466,
194
  "learning_rate": 3.085778781038375e-05,
195
+ "loss": 4.893,
196
  "step": 2200
197
  },
198
  {
199
  "epoch": 2.3323622545915135,
200
+ "grad_norm": 1.8423362970352173,
201
  "learning_rate": 2.9729119638826186e-05,
202
+ "loss": 4.9038,
203
  "step": 2300
204
  },
205
  {
206
  "epoch": 2.433692210259658,
207
+ "grad_norm": 1.9064007997512817,
208
  "learning_rate": 2.8600451467268623e-05,
209
+ "loss": 4.8971,
210
  "step": 2400
211
  },
212
  {
213
  "epoch": 2.5350221659278023,
214
+ "grad_norm": 1.8199445009231567,
215
  "learning_rate": 2.747178329571106e-05,
216
+ "loss": 4.8563,
217
  "step": 2500
218
  },
219
  {
220
  "epoch": 2.5350221659278023,
221
+ "eval_loss": 4.789968013763428,
222
+ "eval_runtime": 61.2645,
223
+ "eval_samples_per_second": 57.276,
224
+ "eval_steps_per_second": 14.331,
225
  "step": 2500
226
  },
227
  {
228
  "epoch": 2.6363521215959467,
229
+ "grad_norm": 2.2458302974700928,
230
  "learning_rate": 2.63431151241535e-05,
231
+ "loss": 4.8214,
232
  "step": 2600
233
  },
234
  {
235
  "epoch": 2.737682077264091,
236
+ "grad_norm": 1.9292908906936646,
237
  "learning_rate": 2.521444695259594e-05,
238
+ "loss": 4.8538,
239
  "step": 2700
240
  },
241
  {
242
  "epoch": 2.8390120329322355,
243
+ "grad_norm": 2.03075909614563,
244
  "learning_rate": 2.4085778781038376e-05,
245
+ "loss": 4.8307,
246
  "step": 2800
247
  },
248
  {
249
  "epoch": 2.94034198860038,
250
+ "grad_norm": 1.909643530845642,
251
  "learning_rate": 2.2957110609480814e-05,
252
+ "loss": 4.8282,
253
  "step": 2900
254
  },
255
  {
256
  "epoch": 3.0425585813806206,
257
+ "grad_norm": 1.8887925148010254,
258
  "learning_rate": 2.182844243792325e-05,
259
+ "loss": 4.8738,
260
  "step": 3000
261
  },
262
  {
263
  "epoch": 3.0425585813806206,
264
+ "eval_loss": 4.765661716461182,
265
+ "eval_runtime": 61.3376,
266
+ "eval_samples_per_second": 57.208,
267
+ "eval_steps_per_second": 14.314,
268
  "step": 3000
269
  },
270
  {
271
  "epoch": 3.143888537048765,
272
+ "grad_norm": 1.8953306674957275,
273
  "learning_rate": 2.069977426636569e-05,
274
+ "loss": 4.7451,
275
  "step": 3100
276
  },
277
  {
278
  "epoch": 3.2452184927169094,
279
+ "grad_norm": 2.1468937397003174,
280
  "learning_rate": 1.957110609480813e-05,
281
+ "loss": 4.7813,
282
  "step": 3200
283
  },
284
  {
285
  "epoch": 3.346548448385054,
286
+ "grad_norm": 1.9347341060638428,
287
  "learning_rate": 1.8442437923250567e-05,
288
+ "loss": 4.792,
289
  "step": 3300
290
  },
291
  {
292
  "epoch": 3.4478784040531982,
293
+ "grad_norm": 1.8998669385910034,
294
  "learning_rate": 1.7313769751693004e-05,
295
+ "loss": 4.7867,
296
  "step": 3400
297
  },
298
  {
299
  "epoch": 3.5492083597213426,
300
+ "grad_norm": 1.899141788482666,
301
  "learning_rate": 1.6185101580135442e-05,
302
+ "loss": 4.8095,
303
  "step": 3500
304
  },
305
  {
306
  "epoch": 3.5492083597213426,
307
+ "eval_loss": 4.752286434173584,
308
+ "eval_runtime": 61.3687,
309
+ "eval_samples_per_second": 57.179,
310
+ "eval_steps_per_second": 14.307,
311
  "step": 3500
312
  },
313
  {
314
  "epoch": 3.650538315389487,
315
+ "grad_norm": 1.9524105787277222,
316
  "learning_rate": 1.5056433408577881e-05,
317
+ "loss": 4.8354,
318
  "step": 3600
319
  },
320
  {
321
  "epoch": 3.7518682710576314,
322
+ "grad_norm": 2.0022027492523193,
323
  "learning_rate": 1.3927765237020315e-05,
324
+ "loss": 4.8232,
325
  "step": 3700
326
  },
327
  {
328
  "epoch": 3.853198226725776,
329
+ "grad_norm": 1.9039005041122437,
330
  "learning_rate": 1.2799097065462754e-05,
331
+ "loss": 4.7974,
332
  "step": 3800
333
  },
334
  {
335
  "epoch": 3.9545281823939202,
336
+ "grad_norm": 1.902718186378479,
337
  "learning_rate": 1.1670428893905193e-05,
338
+ "loss": 4.8301,
339
  "step": 3900
340
  },
341
  {
342
  "epoch": 4.0567447751741605,
343
+ "grad_norm": 2.1200180053710938,
344
  "learning_rate": 1.054176072234763e-05,
345
+ "loss": 4.8426,
346
  "step": 4000
347
  },
348
  {
349
  "epoch": 4.0567447751741605,
350
+ "eval_loss": 4.738107681274414,
351
+ "eval_runtime": 61.3616,
352
+ "eval_samples_per_second": 57.186,
353
+ "eval_steps_per_second": 14.309,
354
  "step": 4000
355
  },
356
  {
357
  "epoch": 4.158074730842305,
358
+ "grad_norm": 2.008924961090088,
359
  "learning_rate": 9.413092550790068e-06,
360
+ "loss": 4.7473,
361
  "step": 4100
362
  },
363
  {
364
  "epoch": 4.259404686510449,
365
+ "grad_norm": 1.881294846534729,
366
  "learning_rate": 8.284424379232506e-06,
367
+ "loss": 4.7348,
368
  "step": 4200
369
  },
370
  {
371
  "epoch": 4.360734642178594,
372
+ "grad_norm": 1.8294726610183716,
373
  "learning_rate": 7.155756207674943e-06,
374
+ "loss": 4.7728,
375
  "step": 4300
376
  },
377
  {
378
  "epoch": 4.462064597846738,
379
+ "grad_norm": 1.9369500875473022,
380
  "learning_rate": 6.0270880361173815e-06,
381
+ "loss": 4.7811,
382
  "step": 4400
383
  },
384
  {
385
  "epoch": 4.5633945535148825,
386
+ "grad_norm": 1.9159400463104248,
387
  "learning_rate": 4.89841986455982e-06,
388
+ "loss": 4.7478,
389
  "step": 4500
390
  },
391
  {
392
  "epoch": 4.5633945535148825,
393
+ "eval_loss": 4.731945037841797,
394
+ "eval_runtime": 61.3768,
395
+ "eval_samples_per_second": 57.171,
396
+ "eval_steps_per_second": 14.305,
397
  "step": 4500
398
  },
399
  {
400
  "epoch": 4.664724509183027,
401
+ "grad_norm": 1.9592151641845703,
402
  "learning_rate": 3.7697516930022577e-06,
403
+ "loss": 4.7561,
404
  "step": 4600
405
  },
406
  {
407
  "epoch": 4.766054464851171,
408
+ "grad_norm": 1.8825119733810425,
409
  "learning_rate": 2.6410835214446955e-06,
410
+ "loss": 4.8118,
411
  "step": 4700
412
  },
413
  {
414
  "epoch": 4.867384420519316,
415
+ "grad_norm": 2.025451898574829,
416
  "learning_rate": 1.5124153498871334e-06,
417
+ "loss": 4.7582,
418
  "step": 4800
419
  },
420
  {
421
  "epoch": 4.96871437618746,
422
+ "grad_norm": 2.0353872776031494,
423
  "learning_rate": 3.837471783295711e-07,
424
+ "loss": 4.7274,
425
  "step": 4900
426
  }
427
  ],
 
442
  "attributes": {}
443
  }
444
  },
445
+ "total_flos": 8.1592623788544e+16,
446
  "train_batch_size": 4,
447
  "trial_name": null,
448
  "trial_params": null