LuigiJoseph commited on
Commit
29a4f7d
·
verified ·
1 Parent(s): 45f105d

Delete checkpoint-8961

Browse files
checkpoint-8961/config.json DELETED
@@ -1,41 +0,0 @@
1
- {
2
- "_name_or_path": "ckartal/english-to-turkish-finetuned-model",
3
- "activation_dropout": 0.0,
4
- "activation_function": "swish",
5
- "architectures": [
6
- "MarianMTModel"
7
- ],
8
- "attention_dropout": 0.0,
9
- "bos_token_id": 0,
10
- "classifier_dropout": 0.0,
11
- "d_model": 512,
12
- "decoder_attention_heads": 8,
13
- "decoder_ffn_dim": 2048,
14
- "decoder_layerdrop": 0.0,
15
- "decoder_layers": 6,
16
- "decoder_start_token_id": 59993,
17
- "decoder_vocab_size": 59994,
18
- "dropout": 0.1,
19
- "encoder_attention_heads": 8,
20
- "encoder_ffn_dim": 2048,
21
- "encoder_layerdrop": 0.0,
22
- "encoder_layers": 6,
23
- "eos_token_id": 0,
24
- "forced_eos_token_id": 0,
25
- "init_std": 0.02,
26
- "is_encoder_decoder": true,
27
- "max_length": null,
28
- "max_position_embeddings": 512,
29
- "model_type": "marian",
30
- "normalize_embedding": false,
31
- "num_beams": null,
32
- "num_hidden_layers": 6,
33
- "pad_token_id": 59993,
34
- "scale_embedding": true,
35
- "share_encoder_decoder_embeddings": true,
36
- "static_position_embeddings": true,
37
- "torch_dtype": "float32",
38
- "transformers_version": "4.49.0",
39
- "use_cache": true,
40
- "vocab_size": 59994
41
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-8961/generation_config.json DELETED
@@ -1,16 +0,0 @@
1
- {
2
- "bad_words_ids": [
3
- [
4
- 59993
5
- ]
6
- ],
7
- "bos_token_id": 0,
8
- "decoder_start_token_id": 59993,
9
- "eos_token_id": 0,
10
- "forced_eos_token_id": 0,
11
- "max_length": 512,
12
- "num_beams": 6,
13
- "pad_token_id": 59993,
14
- "renormalize_logits": true,
15
- "transformers_version": "4.49.0"
16
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-8961/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d68b65a2bd91fa1942bbd3a9736e0dfea53eee5b472c6c4609f74a7efc65371
3
- size 299690728
 
 
 
 
checkpoint-8961/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9504ade0945887bc18c4db219e8e77bfcdbad2a247e8fd1f93ab7936c7fdd1d
3
- size 599054970
 
 
 
 
checkpoint-8961/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:12de9ac1380ec85489c1c0ac2e8e97a71139b009320d67f140f242290f6b39b6
3
- size 14244
 
 
 
 
checkpoint-8961/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f62134c228b9c6de37631c7eafe659790f04722a4440f8d9c2781df03f3081a3
3
- size 988
 
 
 
 
checkpoint-8961/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6434206d5198b72fcdb872b88c17883c2c789d1dea784eef6faa3b3a8d9db9e
3
- size 1064
 
 
 
 
checkpoint-8961/source.spm DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:98eb24f0995a9d5f7cb0fb628c474628b1d2284615e881e857d062c0b651ce10
3
- size 793920
 
 
 
 
checkpoint-8961/special_tokens_map.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "eos_token": {
3
- "content": "</s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "pad_token": {
10
- "content": "<pad>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "unk_token": {
17
- "content": "<unk>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-8961/target.spm DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:45cc6000ed513cdca8f80739087fbcbf9933dc50c9ae36c319c9670882f72e1b
3
- size 837876
 
 
 
 
checkpoint-8961/tokenizer_config.json DELETED
@@ -1,40 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "</s>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "<unk>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "59993": {
20
- "content": "<pad>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- }
27
- },
28
- "clean_up_tokenization_spaces": true,
29
- "eos_token": "</s>",
30
- "extra_special_tokens": {},
31
- "model_max_length": 512,
32
- "pad_token": "<pad>",
33
- "return_tensors": "pt",
34
- "separate_vocabs": false,
35
- "source_lang": "eng",
36
- "sp_model_kwargs": {},
37
- "target_lang": "tur",
38
- "tokenizer_class": "MarianTokenizer",
39
- "unk_token": "<unk>"
40
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-8961/trainer_state.json DELETED
@@ -1,1310 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
- "eval_steps": 500,
6
- "global_step": 8961,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.016739203213927016,
13
- "grad_norm": 0.439177542924881,
14
- "learning_rate": 4.987445597589555e-05,
15
- "loss": 1.5222,
16
- "step": 50
17
- },
18
- {
19
- "epoch": 0.03347840642785403,
20
- "grad_norm": 0.4248828887939453,
21
- "learning_rate": 4.973496261577949e-05,
22
- "loss": 0.1842,
23
- "step": 100
24
- },
25
- {
26
- "epoch": 0.05021760964178105,
27
- "grad_norm": 0.3019009232521057,
28
- "learning_rate": 4.9595469255663436e-05,
29
- "loss": 0.1471,
30
- "step": 150
31
- },
32
- {
33
- "epoch": 0.06695681285570806,
34
- "grad_norm": 0.2518245577812195,
35
- "learning_rate": 4.9455975895547376e-05,
36
- "loss": 0.1306,
37
- "step": 200
38
- },
39
- {
40
- "epoch": 0.08369601606963509,
41
- "grad_norm": 0.3660012185573578,
42
- "learning_rate": 4.931648253543131e-05,
43
- "loss": 0.1276,
44
- "step": 250
45
- },
46
- {
47
- "epoch": 0.1004352192835621,
48
- "grad_norm": 0.32854148745536804,
49
- "learning_rate": 4.917698917531526e-05,
50
- "loss": 0.1048,
51
- "step": 300
52
- },
53
- {
54
- "epoch": 0.11717442249748912,
55
- "grad_norm": 0.24879515171051025,
56
- "learning_rate": 4.90374958151992e-05,
57
- "loss": 0.1054,
58
- "step": 350
59
- },
60
- {
61
- "epoch": 0.13391362571141613,
62
- "grad_norm": 0.36416563391685486,
63
- "learning_rate": 4.889800245508314e-05,
64
- "loss": 0.0985,
65
- "step": 400
66
- },
67
- {
68
- "epoch": 0.15065282892534315,
69
- "grad_norm": 0.33641186356544495,
70
- "learning_rate": 4.875850909496708e-05,
71
- "loss": 0.1044,
72
- "step": 450
73
- },
74
- {
75
- "epoch": 0.16739203213927017,
76
- "grad_norm": 0.32909244298934937,
77
- "learning_rate": 4.861901573485103e-05,
78
- "loss": 0.1089,
79
- "step": 500
80
- },
81
- {
82
- "epoch": 0.1841312353531972,
83
- "grad_norm": 0.36060285568237305,
84
- "learning_rate": 4.847952237473497e-05,
85
- "loss": 0.09,
86
- "step": 550
87
- },
88
- {
89
- "epoch": 0.2008704385671242,
90
- "grad_norm": 0.2510785758495331,
91
- "learning_rate": 4.83400290146189e-05,
92
- "loss": 0.0884,
93
- "step": 600
94
- },
95
- {
96
- "epoch": 0.21760964178105122,
97
- "grad_norm": 0.22478719055652618,
98
- "learning_rate": 4.820053565450285e-05,
99
- "loss": 0.0866,
100
- "step": 650
101
- },
102
- {
103
- "epoch": 0.23434884499497824,
104
- "grad_norm": 0.37321263551712036,
105
- "learning_rate": 4.806104229438679e-05,
106
- "loss": 0.0884,
107
- "step": 700
108
- },
109
- {
110
- "epoch": 0.25108804820890523,
111
- "grad_norm": 0.2660929262638092,
112
- "learning_rate": 4.792154893427073e-05,
113
- "loss": 0.0819,
114
- "step": 750
115
- },
116
- {
117
- "epoch": 0.26782725142283226,
118
- "grad_norm": 0.2338525801897049,
119
- "learning_rate": 4.778205557415467e-05,
120
- "loss": 0.0845,
121
- "step": 800
122
- },
123
- {
124
- "epoch": 0.2845664546367593,
125
- "grad_norm": 0.308557391166687,
126
- "learning_rate": 4.764256221403862e-05,
127
- "loss": 0.0815,
128
- "step": 850
129
- },
130
- {
131
- "epoch": 0.3013056578506863,
132
- "grad_norm": 0.27098262310028076,
133
- "learning_rate": 4.750306885392255e-05,
134
- "loss": 0.0833,
135
- "step": 900
136
- },
137
- {
138
- "epoch": 0.3180448610646133,
139
- "grad_norm": 0.23054952919483185,
140
- "learning_rate": 4.736357549380649e-05,
141
- "loss": 0.0806,
142
- "step": 950
143
- },
144
- {
145
- "epoch": 0.33478406427854035,
146
- "grad_norm": 0.21355900168418884,
147
- "learning_rate": 4.722408213369044e-05,
148
- "loss": 0.073,
149
- "step": 1000
150
- },
151
- {
152
- "epoch": 0.3515232674924674,
153
- "grad_norm": 0.20395708084106445,
154
- "learning_rate": 4.708458877357438e-05,
155
- "loss": 0.0775,
156
- "step": 1050
157
- },
158
- {
159
- "epoch": 0.3682624707063944,
160
- "grad_norm": 0.21063613891601562,
161
- "learning_rate": 4.694509541345832e-05,
162
- "loss": 0.0789,
163
- "step": 1100
164
- },
165
- {
166
- "epoch": 0.3850016739203214,
167
- "grad_norm": 0.20589284598827362,
168
- "learning_rate": 4.680560205334226e-05,
169
- "loss": 0.0809,
170
- "step": 1150
171
- },
172
- {
173
- "epoch": 0.4017408771342484,
174
- "grad_norm": 0.27975228428840637,
175
- "learning_rate": 4.666610869322621e-05,
176
- "loss": 0.078,
177
- "step": 1200
178
- },
179
- {
180
- "epoch": 0.4184800803481754,
181
- "grad_norm": 0.2529745399951935,
182
- "learning_rate": 4.6526615333110144e-05,
183
- "loss": 0.0704,
184
- "step": 1250
185
- },
186
- {
187
- "epoch": 0.43521928356210243,
188
- "grad_norm": 0.2205154448747635,
189
- "learning_rate": 4.6387121972994084e-05,
190
- "loss": 0.0733,
191
- "step": 1300
192
- },
193
- {
194
- "epoch": 0.45195848677602946,
195
- "grad_norm": 0.2254629135131836,
196
- "learning_rate": 4.624762861287803e-05,
197
- "loss": 0.0751,
198
- "step": 1350
199
- },
200
- {
201
- "epoch": 0.4686976899899565,
202
- "grad_norm": 0.17614957690238953,
203
- "learning_rate": 4.610813525276197e-05,
204
- "loss": 0.0747,
205
- "step": 1400
206
- },
207
- {
208
- "epoch": 0.4854368932038835,
209
- "grad_norm": 0.15940478444099426,
210
- "learning_rate": 4.596864189264591e-05,
211
- "loss": 0.0698,
212
- "step": 1450
213
- },
214
- {
215
- "epoch": 0.5021760964178105,
216
- "grad_norm": 0.1869521141052246,
217
- "learning_rate": 4.5829148532529854e-05,
218
- "loss": 0.0721,
219
- "step": 1500
220
- },
221
- {
222
- "epoch": 0.5189152996317375,
223
- "grad_norm": 0.36063650250434875,
224
- "learning_rate": 4.5689655172413794e-05,
225
- "loss": 0.0706,
226
- "step": 1550
227
- },
228
- {
229
- "epoch": 0.5356545028456645,
230
- "grad_norm": 0.16967014968395233,
231
- "learning_rate": 4.5550161812297735e-05,
232
- "loss": 0.0759,
233
- "step": 1600
234
- },
235
- {
236
- "epoch": 0.5523937060595916,
237
- "grad_norm": 0.29293423891067505,
238
- "learning_rate": 4.5410668452181676e-05,
239
- "loss": 0.0711,
240
- "step": 1650
241
- },
242
- {
243
- "epoch": 0.5691329092735186,
244
- "grad_norm": 0.3034748136997223,
245
- "learning_rate": 4.527117509206562e-05,
246
- "loss": 0.067,
247
- "step": 1700
248
- },
249
- {
250
- "epoch": 0.5858721124874456,
251
- "grad_norm": 0.1974593997001648,
252
- "learning_rate": 4.513168173194956e-05,
253
- "loss": 0.0701,
254
- "step": 1750
255
- },
256
- {
257
- "epoch": 0.6026113157013726,
258
- "grad_norm": 0.18101799488067627,
259
- "learning_rate": 4.4992188371833505e-05,
260
- "loss": 0.0717,
261
- "step": 1800
262
- },
263
- {
264
- "epoch": 0.6193505189152997,
265
- "grad_norm": 0.14422941207885742,
266
- "learning_rate": 4.4852695011717445e-05,
267
- "loss": 0.0686,
268
- "step": 1850
269
- },
270
- {
271
- "epoch": 0.6360897221292267,
272
- "grad_norm": 0.28663551807403564,
273
- "learning_rate": 4.4713201651601386e-05,
274
- "loss": 0.0646,
275
- "step": 1900
276
- },
277
- {
278
- "epoch": 0.6528289253431536,
279
- "grad_norm": 0.23879379034042358,
280
- "learning_rate": 4.4573708291485327e-05,
281
- "loss": 0.0684,
282
- "step": 1950
283
- },
284
- {
285
- "epoch": 0.6695681285570807,
286
- "grad_norm": 0.21389362215995789,
287
- "learning_rate": 4.443421493136927e-05,
288
- "loss": 0.066,
289
- "step": 2000
290
- },
291
- {
292
- "epoch": 0.6863073317710077,
293
- "grad_norm": 0.26841893792152405,
294
- "learning_rate": 4.4294721571253215e-05,
295
- "loss": 0.0717,
296
- "step": 2050
297
- },
298
- {
299
- "epoch": 0.7030465349849347,
300
- "grad_norm": 0.240205317735672,
301
- "learning_rate": 4.415522821113715e-05,
302
- "loss": 0.0697,
303
- "step": 2100
304
- },
305
- {
306
- "epoch": 0.7197857381988617,
307
- "grad_norm": 0.28098127245903015,
308
- "learning_rate": 4.4015734851021096e-05,
309
- "loss": 0.0713,
310
- "step": 2150
311
- },
312
- {
313
- "epoch": 0.7365249414127888,
314
- "grad_norm": 0.23308847844600677,
315
- "learning_rate": 4.3876241490905037e-05,
316
- "loss": 0.0667,
317
- "step": 2200
318
- },
319
- {
320
- "epoch": 0.7532641446267158,
321
- "grad_norm": 0.22748568654060364,
322
- "learning_rate": 4.373674813078898e-05,
323
- "loss": 0.0605,
324
- "step": 2250
325
- },
326
- {
327
- "epoch": 0.7700033478406428,
328
- "grad_norm": 0.3932187259197235,
329
- "learning_rate": 4.359725477067292e-05,
330
- "loss": 0.0676,
331
- "step": 2300
332
- },
333
- {
334
- "epoch": 0.7867425510545698,
335
- "grad_norm": 0.23918767273426056,
336
- "learning_rate": 4.345776141055686e-05,
337
- "loss": 0.0624,
338
- "step": 2350
339
- },
340
- {
341
- "epoch": 0.8034817542684968,
342
- "grad_norm": 0.3068426549434662,
343
- "learning_rate": 4.33182680504408e-05,
344
- "loss": 0.0664,
345
- "step": 2400
346
- },
347
- {
348
- "epoch": 0.8202209574824239,
349
- "grad_norm": 0.17977873980998993,
350
- "learning_rate": 4.317877469032474e-05,
351
- "loss": 0.0726,
352
- "step": 2450
353
- },
354
- {
355
- "epoch": 0.8369601606963508,
356
- "grad_norm": 0.16876642405986786,
357
- "learning_rate": 4.303928133020869e-05,
358
- "loss": 0.0639,
359
- "step": 2500
360
- },
361
- {
362
- "epoch": 0.8536993639102779,
363
- "grad_norm": 0.17980250716209412,
364
- "learning_rate": 4.289978797009263e-05,
365
- "loss": 0.0701,
366
- "step": 2550
367
- },
368
- {
369
- "epoch": 0.8704385671242049,
370
- "grad_norm": 0.1711459904909134,
371
- "learning_rate": 4.276029460997656e-05,
372
- "loss": 0.063,
373
- "step": 2600
374
- },
375
- {
376
- "epoch": 0.8871777703381319,
377
- "grad_norm": 0.443228542804718,
378
- "learning_rate": 4.262080124986051e-05,
379
- "loss": 0.0675,
380
- "step": 2650
381
- },
382
- {
383
- "epoch": 0.9039169735520589,
384
- "grad_norm": 0.2098589390516281,
385
- "learning_rate": 4.248130788974445e-05,
386
- "loss": 0.062,
387
- "step": 2700
388
- },
389
- {
390
- "epoch": 0.920656176765986,
391
- "grad_norm": 0.3022039234638214,
392
- "learning_rate": 4.234181452962839e-05,
393
- "loss": 0.07,
394
- "step": 2750
395
- },
396
- {
397
- "epoch": 0.937395379979913,
398
- "grad_norm": 0.19368910789489746,
399
- "learning_rate": 4.220232116951233e-05,
400
- "loss": 0.0621,
401
- "step": 2800
402
- },
403
- {
404
- "epoch": 0.9541345831938399,
405
- "grad_norm": 0.18753108382225037,
406
- "learning_rate": 4.206282780939628e-05,
407
- "loss": 0.0631,
408
- "step": 2850
409
- },
410
- {
411
- "epoch": 0.970873786407767,
412
- "grad_norm": 0.15517786145210266,
413
- "learning_rate": 4.192333444928022e-05,
414
- "loss": 0.0641,
415
- "step": 2900
416
- },
417
- {
418
- "epoch": 0.987612989621694,
419
- "grad_norm": 0.11765792220830917,
420
- "learning_rate": 4.178384108916415e-05,
421
- "loss": 0.0612,
422
- "step": 2950
423
- },
424
- {
425
- "epoch": 1.0,
426
- "eval_loss": 0.05521286651492119,
427
- "eval_runtime": 50.415,
428
- "eval_samples_per_second": 236.993,
429
- "eval_steps_per_second": 14.817,
430
- "step": 2987
431
- },
432
- {
433
- "epoch": 1.004352192835621,
434
- "grad_norm": 0.2691793739795685,
435
- "learning_rate": 4.16443477290481e-05,
436
- "loss": 0.059,
437
- "step": 3000
438
- },
439
- {
440
- "epoch": 1.021091396049548,
441
- "grad_norm": 0.394694060087204,
442
- "learning_rate": 4.150485436893204e-05,
443
- "loss": 0.0566,
444
- "step": 3050
445
- },
446
- {
447
- "epoch": 1.037830599263475,
448
- "grad_norm": 0.19438503682613373,
449
- "learning_rate": 4.136536100881598e-05,
450
- "loss": 0.0591,
451
- "step": 3100
452
- },
453
- {
454
- "epoch": 1.0545698024774022,
455
- "grad_norm": 0.21350933611392975,
456
- "learning_rate": 4.122586764869992e-05,
457
- "loss": 0.0509,
458
- "step": 3150
459
- },
460
- {
461
- "epoch": 1.071309005691329,
462
- "grad_norm": 0.26747575402259827,
463
- "learning_rate": 4.108637428858387e-05,
464
- "loss": 0.0589,
465
- "step": 3200
466
- },
467
- {
468
- "epoch": 1.088048208905256,
469
- "grad_norm": 0.31256961822509766,
470
- "learning_rate": 4.0946880928467804e-05,
471
- "loss": 0.0602,
472
- "step": 3250
473
- },
474
- {
475
- "epoch": 1.1047874121191832,
476
- "grad_norm": 0.18631280958652496,
477
- "learning_rate": 4.0807387568351745e-05,
478
- "loss": 0.0547,
479
- "step": 3300
480
- },
481
- {
482
- "epoch": 1.12152661533311,
483
- "grad_norm": 0.18677473068237305,
484
- "learning_rate": 4.066789420823569e-05,
485
- "loss": 0.0543,
486
- "step": 3350
487
- },
488
- {
489
- "epoch": 1.1382658185470371,
490
- "grad_norm": 0.24535444378852844,
491
- "learning_rate": 4.052840084811963e-05,
492
- "loss": 0.0583,
493
- "step": 3400
494
- },
495
- {
496
- "epoch": 1.1550050217609642,
497
- "grad_norm": 0.1752105951309204,
498
- "learning_rate": 4.038890748800357e-05,
499
- "loss": 0.0504,
500
- "step": 3450
501
- },
502
- {
503
- "epoch": 1.1717442249748913,
504
- "grad_norm": 0.14743360877037048,
505
- "learning_rate": 4.0249414127887514e-05,
506
- "loss": 0.055,
507
- "step": 3500
508
- },
509
- {
510
- "epoch": 1.1884834281888181,
511
- "grad_norm": 0.11535945534706116,
512
- "learning_rate": 4.010992076777146e-05,
513
- "loss": 0.0552,
514
- "step": 3550
515
- },
516
- {
517
- "epoch": 1.2052226314027452,
518
- "grad_norm": 0.26563358306884766,
519
- "learning_rate": 3.9970427407655395e-05,
520
- "loss": 0.0552,
521
- "step": 3600
522
- },
523
- {
524
- "epoch": 1.2219618346166723,
525
- "grad_norm": 0.15104246139526367,
526
- "learning_rate": 3.9830934047539336e-05,
527
- "loss": 0.0575,
528
- "step": 3650
529
- },
530
- {
531
- "epoch": 1.2387010378305994,
532
- "grad_norm": 0.2198421210050583,
533
- "learning_rate": 3.9691440687423283e-05,
534
- "loss": 0.0567,
535
- "step": 3700
536
- },
537
- {
538
- "epoch": 1.2554402410445262,
539
- "grad_norm": 0.20177733898162842,
540
- "learning_rate": 3.955194732730722e-05,
541
- "loss": 0.0556,
542
- "step": 3750
543
- },
544
- {
545
- "epoch": 1.2721794442584533,
546
- "grad_norm": 0.36604830622673035,
547
- "learning_rate": 3.9412453967191165e-05,
548
- "loss": 0.0569,
549
- "step": 3800
550
- },
551
- {
552
- "epoch": 1.2889186474723804,
553
- "grad_norm": 0.18883727490901947,
554
- "learning_rate": 3.9272960607075105e-05,
555
- "loss": 0.0595,
556
- "step": 3850
557
- },
558
- {
559
- "epoch": 1.3056578506863072,
560
- "grad_norm": 0.14828617870807648,
561
- "learning_rate": 3.9133467246959046e-05,
562
- "loss": 0.0548,
563
- "step": 3900
564
- },
565
- {
566
- "epoch": 1.3223970539002343,
567
- "grad_norm": 0.19220437109470367,
568
- "learning_rate": 3.899397388684299e-05,
569
- "loss": 0.053,
570
- "step": 3950
571
- },
572
- {
573
- "epoch": 1.3391362571141614,
574
- "grad_norm": 0.16049669682979584,
575
- "learning_rate": 3.885448052672693e-05,
576
- "loss": 0.0581,
577
- "step": 4000
578
- },
579
- {
580
- "epoch": 1.3558754603280883,
581
- "grad_norm": 0.22821515798568726,
582
- "learning_rate": 3.8714987166610875e-05,
583
- "loss": 0.0518,
584
- "step": 4050
585
- },
586
- {
587
- "epoch": 1.3726146635420153,
588
- "grad_norm": 0.1879580318927765,
589
- "learning_rate": 3.857549380649481e-05,
590
- "loss": 0.0574,
591
- "step": 4100
592
- },
593
- {
594
- "epoch": 1.3893538667559424,
595
- "grad_norm": 0.16026251018047333,
596
- "learning_rate": 3.8436000446378756e-05,
597
- "loss": 0.063,
598
- "step": 4150
599
- },
600
- {
601
- "epoch": 1.4060930699698695,
602
- "grad_norm": 0.26868143677711487,
603
- "learning_rate": 3.82965070862627e-05,
604
- "loss": 0.0571,
605
- "step": 4200
606
- },
607
- {
608
- "epoch": 1.4228322731837966,
609
- "grad_norm": 0.2529687285423279,
610
- "learning_rate": 3.815701372614664e-05,
611
- "loss": 0.0528,
612
- "step": 4250
613
- },
614
- {
615
- "epoch": 1.4395714763977234,
616
- "grad_norm": 0.19138221442699432,
617
- "learning_rate": 3.801752036603058e-05,
618
- "loss": 0.0584,
619
- "step": 4300
620
- },
621
- {
622
- "epoch": 1.4563106796116505,
623
- "grad_norm": 0.16359661519527435,
624
- "learning_rate": 3.787802700591452e-05,
625
- "loss": 0.0539,
626
- "step": 4350
627
- },
628
- {
629
- "epoch": 1.4730498828255776,
630
- "grad_norm": 0.1373494267463684,
631
- "learning_rate": 3.7738533645798466e-05,
632
- "loss": 0.0557,
633
- "step": 4400
634
- },
635
- {
636
- "epoch": 1.4897890860395044,
637
- "grad_norm": 0.15695162117481232,
638
- "learning_rate": 3.75990402856824e-05,
639
- "loss": 0.0491,
640
- "step": 4450
641
- },
642
- {
643
- "epoch": 1.5065282892534315,
644
- "grad_norm": 0.18462614715099335,
645
- "learning_rate": 3.745954692556635e-05,
646
- "loss": 0.0495,
647
- "step": 4500
648
- },
649
- {
650
- "epoch": 1.5232674924673586,
651
- "grad_norm": 0.27876704931259155,
652
- "learning_rate": 3.732005356545029e-05,
653
- "loss": 0.0523,
654
- "step": 4550
655
- },
656
- {
657
- "epoch": 1.5400066956812855,
658
- "grad_norm": 0.30491840839385986,
659
- "learning_rate": 3.718056020533423e-05,
660
- "loss": 0.0564,
661
- "step": 4600
662
- },
663
- {
664
- "epoch": 1.5567458988952128,
665
- "grad_norm": 0.18721336126327515,
666
- "learning_rate": 3.704106684521817e-05,
667
- "loss": 0.0524,
668
- "step": 4650
669
- },
670
- {
671
- "epoch": 1.5734851021091396,
672
- "grad_norm": 0.21216215193271637,
673
- "learning_rate": 3.690157348510211e-05,
674
- "loss": 0.0521,
675
- "step": 4700
676
- },
677
- {
678
- "epoch": 1.5902243053230665,
679
- "grad_norm": 0.1368396282196045,
680
- "learning_rate": 3.676208012498605e-05,
681
- "loss": 0.056,
682
- "step": 4750
683
- },
684
- {
685
- "epoch": 1.6069635085369938,
686
- "grad_norm": 0.13692086935043335,
687
- "learning_rate": 3.662258676486999e-05,
688
- "loss": 0.0443,
689
- "step": 4800
690
- },
691
- {
692
- "epoch": 1.6237027117509206,
693
- "grad_norm": 0.11640128493309021,
694
- "learning_rate": 3.648309340475394e-05,
695
- "loss": 0.0488,
696
- "step": 4850
697
- },
698
- {
699
- "epoch": 1.6404419149648477,
700
- "grad_norm": 0.19953882694244385,
701
- "learning_rate": 3.634360004463788e-05,
702
- "loss": 0.0553,
703
- "step": 4900
704
- },
705
- {
706
- "epoch": 1.6571811181787748,
707
- "grad_norm": 0.1966984122991562,
708
- "learning_rate": 3.6204106684521813e-05,
709
- "loss": 0.0536,
710
- "step": 4950
711
- },
712
- {
713
- "epoch": 1.6739203213927016,
714
- "grad_norm": 0.2324533313512802,
715
- "learning_rate": 3.606461332440576e-05,
716
- "loss": 0.0493,
717
- "step": 5000
718
- },
719
- {
720
- "epoch": 1.6906595246066287,
721
- "grad_norm": 0.16217607259750366,
722
- "learning_rate": 3.59251199642897e-05,
723
- "loss": 0.0503,
724
- "step": 5050
725
- },
726
- {
727
- "epoch": 1.7073987278205558,
728
- "grad_norm": 0.23949602246284485,
729
- "learning_rate": 3.578562660417364e-05,
730
- "loss": 0.0556,
731
- "step": 5100
732
- },
733
- {
734
- "epoch": 1.7241379310344827,
735
- "grad_norm": 0.21387897431850433,
736
- "learning_rate": 3.564613324405758e-05,
737
- "loss": 0.0548,
738
- "step": 5150
739
- },
740
- {
741
- "epoch": 1.7408771342484097,
742
- "grad_norm": 0.2055111676454544,
743
- "learning_rate": 3.550663988394153e-05,
744
- "loss": 0.06,
745
- "step": 5200
746
- },
747
- {
748
- "epoch": 1.7576163374623368,
749
- "grad_norm": 0.20280921459197998,
750
- "learning_rate": 3.5367146523825464e-05,
751
- "loss": 0.0508,
752
- "step": 5250
753
- },
754
- {
755
- "epoch": 1.7743555406762637,
756
- "grad_norm": 0.14165103435516357,
757
- "learning_rate": 3.5227653163709405e-05,
758
- "loss": 0.0581,
759
- "step": 5300
760
- },
761
- {
762
- "epoch": 1.791094743890191,
763
- "grad_norm": 0.18099863827228546,
764
- "learning_rate": 3.508815980359335e-05,
765
- "loss": 0.0562,
766
- "step": 5350
767
- },
768
- {
769
- "epoch": 1.8078339471041178,
770
- "grad_norm": 0.21743184328079224,
771
- "learning_rate": 3.494866644347729e-05,
772
- "loss": 0.0498,
773
- "step": 5400
774
- },
775
- {
776
- "epoch": 1.824573150318045,
777
- "grad_norm": 0.20934534072875977,
778
- "learning_rate": 3.4809173083361234e-05,
779
- "loss": 0.0549,
780
- "step": 5450
781
- },
782
- {
783
- "epoch": 1.841312353531972,
784
- "grad_norm": 0.1582174152135849,
785
- "learning_rate": 3.4669679723245174e-05,
786
- "loss": 0.0556,
787
- "step": 5500
788
- },
789
- {
790
- "epoch": 1.8580515567458988,
791
- "grad_norm": 0.1624903827905655,
792
- "learning_rate": 3.453018636312912e-05,
793
- "loss": 0.0516,
794
- "step": 5550
795
- },
796
- {
797
- "epoch": 1.874790759959826,
798
- "grad_norm": 0.16255798935890198,
799
- "learning_rate": 3.4390693003013056e-05,
800
- "loss": 0.0542,
801
- "step": 5600
802
- },
803
- {
804
- "epoch": 1.891529963173753,
805
- "grad_norm": 0.1269742250442505,
806
- "learning_rate": 3.4251199642896996e-05,
807
- "loss": 0.0565,
808
- "step": 5650
809
- },
810
- {
811
- "epoch": 1.9082691663876798,
812
- "grad_norm": 0.15966229140758514,
813
- "learning_rate": 3.4111706282780944e-05,
814
- "loss": 0.0538,
815
- "step": 5700
816
- },
817
- {
818
- "epoch": 1.925008369601607,
819
- "grad_norm": 0.21506330370903015,
820
- "learning_rate": 3.3972212922664884e-05,
821
- "loss": 0.0505,
822
- "step": 5750
823
- },
824
- {
825
- "epoch": 1.941747572815534,
826
- "grad_norm": 0.2145415097475052,
827
- "learning_rate": 3.3832719562548825e-05,
828
- "loss": 0.0521,
829
- "step": 5800
830
- },
831
- {
832
- "epoch": 1.9584867760294609,
833
- "grad_norm": 0.10960496962070465,
834
- "learning_rate": 3.3693226202432766e-05,
835
- "loss": 0.0513,
836
- "step": 5850
837
- },
838
- {
839
- "epoch": 1.9752259792433882,
840
- "grad_norm": 0.13635843992233276,
841
- "learning_rate": 3.355373284231671e-05,
842
- "loss": 0.0499,
843
- "step": 5900
844
- },
845
- {
846
- "epoch": 1.991965182457315,
847
- "grad_norm": 0.1542210429906845,
848
- "learning_rate": 3.341423948220065e-05,
849
- "loss": 0.0556,
850
- "step": 5950
851
- },
852
- {
853
- "epoch": 2.0,
854
- "eval_loss": 0.04946442320942879,
855
- "eval_runtime": 55.6114,
856
- "eval_samples_per_second": 214.848,
857
- "eval_steps_per_second": 13.432,
858
- "step": 5974
859
- },
860
- {
861
- "epoch": 2.008704385671242,
862
- "grad_norm": 0.1718842089176178,
863
- "learning_rate": 3.327474612208459e-05,
864
- "loss": 0.0503,
865
- "step": 6000
866
- },
867
- {
868
- "epoch": 2.025443588885169,
869
- "grad_norm": 0.1528020203113556,
870
- "learning_rate": 3.3135252761968535e-05,
871
- "loss": 0.0479,
872
- "step": 6050
873
- },
874
- {
875
- "epoch": 2.042182792099096,
876
- "grad_norm": 0.19148772954940796,
877
- "learning_rate": 3.299575940185247e-05,
878
- "loss": 0.0443,
879
- "step": 6100
880
- },
881
- {
882
- "epoch": 2.058921995313023,
883
- "grad_norm": 0.18125496804714203,
884
- "learning_rate": 3.2856266041736416e-05,
885
- "loss": 0.0473,
886
- "step": 6150
887
- },
888
- {
889
- "epoch": 2.07566119852695,
890
- "grad_norm": 0.20772996544837952,
891
- "learning_rate": 3.271677268162036e-05,
892
- "loss": 0.0539,
893
- "step": 6200
894
- },
895
- {
896
- "epoch": 2.092400401740877,
897
- "grad_norm": 0.2518468201160431,
898
- "learning_rate": 3.25772793215043e-05,
899
- "loss": 0.0468,
900
- "step": 6250
901
- },
902
- {
903
- "epoch": 2.1091396049548043,
904
- "grad_norm": 0.1350301206111908,
905
- "learning_rate": 3.243778596138824e-05,
906
- "loss": 0.0431,
907
- "step": 6300
908
- },
909
- {
910
- "epoch": 2.125878808168731,
911
- "grad_norm": 0.19141735136508942,
912
- "learning_rate": 3.229829260127218e-05,
913
- "loss": 0.048,
914
- "step": 6350
915
- },
916
- {
917
- "epoch": 2.142618011382658,
918
- "grad_norm": 0.2404586374759674,
919
- "learning_rate": 3.2158799241156126e-05,
920
- "loss": 0.0516,
921
- "step": 6400
922
- },
923
- {
924
- "epoch": 2.1593572145965854,
925
- "grad_norm": 0.21710112690925598,
926
- "learning_rate": 3.201930588104006e-05,
927
- "loss": 0.0472,
928
- "step": 6450
929
- },
930
- {
931
- "epoch": 2.176096417810512,
932
- "grad_norm": 0.14395031332969666,
933
- "learning_rate": 3.187981252092401e-05,
934
- "loss": 0.0438,
935
- "step": 6500
936
- },
937
- {
938
- "epoch": 2.192835621024439,
939
- "grad_norm": 0.20882932841777802,
940
- "learning_rate": 3.174031916080795e-05,
941
- "loss": 0.0488,
942
- "step": 6550
943
- },
944
- {
945
- "epoch": 2.2095748242383664,
946
- "grad_norm": 0.13824905455112457,
947
- "learning_rate": 3.160082580069189e-05,
948
- "loss": 0.0464,
949
- "step": 6600
950
- },
951
- {
952
- "epoch": 2.2263140274522932,
953
- "grad_norm": 0.1783577799797058,
954
- "learning_rate": 3.146133244057583e-05,
955
- "loss": 0.0459,
956
- "step": 6650
957
- },
958
- {
959
- "epoch": 2.24305323066622,
960
- "grad_norm": 0.22147531807422638,
961
- "learning_rate": 3.132183908045977e-05,
962
- "loss": 0.0476,
963
- "step": 6700
964
- },
965
- {
966
- "epoch": 2.2597924338801474,
967
- "grad_norm": 0.17393821477890015,
968
- "learning_rate": 3.118234572034371e-05,
969
- "loss": 0.0436,
970
- "step": 6750
971
- },
972
- {
973
- "epoch": 2.2765316370940742,
974
- "grad_norm": 0.15850785374641418,
975
- "learning_rate": 3.104285236022765e-05,
976
- "loss": 0.0476,
977
- "step": 6800
978
- },
979
- {
980
- "epoch": 2.2932708403080015,
981
- "grad_norm": 0.16232182085514069,
982
- "learning_rate": 3.09033590001116e-05,
983
- "loss": 0.0473,
984
- "step": 6850
985
- },
986
- {
987
- "epoch": 2.3100100435219284,
988
- "grad_norm": 0.1816001981496811,
989
- "learning_rate": 3.076386563999554e-05,
990
- "loss": 0.0427,
991
- "step": 6900
992
- },
993
- {
994
- "epoch": 2.3267492467358553,
995
- "grad_norm": 0.13417834043502808,
996
- "learning_rate": 3.062437227987948e-05,
997
- "loss": 0.0448,
998
- "step": 6950
999
- },
1000
- {
1001
- "epoch": 2.3434884499497826,
1002
- "grad_norm": 0.12576530873775482,
1003
- "learning_rate": 3.048487891976342e-05,
1004
- "loss": 0.0453,
1005
- "step": 7000
1006
- },
1007
- {
1008
- "epoch": 2.3602276531637094,
1009
- "grad_norm": 0.33120718598365784,
1010
- "learning_rate": 3.0345385559647362e-05,
1011
- "loss": 0.0462,
1012
- "step": 7050
1013
- },
1014
- {
1015
- "epoch": 2.3769668563776363,
1016
- "grad_norm": 0.22310969233512878,
1017
- "learning_rate": 3.0205892199531306e-05,
1018
- "loss": 0.0475,
1019
- "step": 7100
1020
- },
1021
- {
1022
- "epoch": 2.3937060595915636,
1023
- "grad_norm": 0.18150626122951508,
1024
- "learning_rate": 3.0066398839415243e-05,
1025
- "loss": 0.0489,
1026
- "step": 7150
1027
- },
1028
- {
1029
- "epoch": 2.4104452628054904,
1030
- "grad_norm": 0.28730452060699463,
1031
- "learning_rate": 2.9926905479299187e-05,
1032
- "loss": 0.0536,
1033
- "step": 7200
1034
- },
1035
- {
1036
- "epoch": 2.4271844660194173,
1037
- "grad_norm": 0.1918480098247528,
1038
- "learning_rate": 2.9787412119183128e-05,
1039
- "loss": 0.0426,
1040
- "step": 7250
1041
- },
1042
- {
1043
- "epoch": 2.4439236692333446,
1044
- "grad_norm": 0.16158398985862732,
1045
- "learning_rate": 2.964791875906707e-05,
1046
- "loss": 0.0458,
1047
- "step": 7300
1048
- },
1049
- {
1050
- "epoch": 2.4606628724472714,
1051
- "grad_norm": 0.27141231298446655,
1052
- "learning_rate": 2.9508425398951012e-05,
1053
- "loss": 0.0454,
1054
- "step": 7350
1055
- },
1056
- {
1057
- "epoch": 2.4774020756611987,
1058
- "grad_norm": 0.1777345836162567,
1059
- "learning_rate": 2.936893203883495e-05,
1060
- "loss": 0.0435,
1061
- "step": 7400
1062
- },
1063
- {
1064
- "epoch": 2.4941412788751256,
1065
- "grad_norm": 0.14735421538352966,
1066
- "learning_rate": 2.9229438678718897e-05,
1067
- "loss": 0.0489,
1068
- "step": 7450
1069
- },
1070
- {
1071
- "epoch": 2.5108804820890525,
1072
- "grad_norm": 0.1486055999994278,
1073
- "learning_rate": 2.9089945318602834e-05,
1074
- "loss": 0.0477,
1075
- "step": 7500
1076
- },
1077
- {
1078
- "epoch": 2.5276196853029793,
1079
- "grad_norm": 0.17078754305839539,
1080
- "learning_rate": 2.895045195848678e-05,
1081
- "loss": 0.0444,
1082
- "step": 7550
1083
- },
1084
- {
1085
- "epoch": 2.5443588885169066,
1086
- "grad_norm": 0.19276435673236847,
1087
- "learning_rate": 2.881095859837072e-05,
1088
- "loss": 0.0486,
1089
- "step": 7600
1090
- },
1091
- {
1092
- "epoch": 2.5610980917308335,
1093
- "grad_norm": 0.21209606528282166,
1094
- "learning_rate": 2.8671465238254656e-05,
1095
- "loss": 0.0497,
1096
- "step": 7650
1097
- },
1098
- {
1099
- "epoch": 2.5778372949447608,
1100
- "grad_norm": 0.21018877625465393,
1101
- "learning_rate": 2.8531971878138604e-05,
1102
- "loss": 0.0441,
1103
- "step": 7700
1104
- },
1105
- {
1106
- "epoch": 2.5945764981586876,
1107
- "grad_norm": 0.15666617453098297,
1108
- "learning_rate": 2.839247851802254e-05,
1109
- "loss": 0.0467,
1110
- "step": 7750
1111
- },
1112
- {
1113
- "epoch": 2.6113157013726145,
1114
- "grad_norm": 0.1940685212612152,
1115
- "learning_rate": 2.8252985157906485e-05,
1116
- "loss": 0.0523,
1117
- "step": 7800
1118
- },
1119
- {
1120
- "epoch": 2.628054904586542,
1121
- "grad_norm": 0.28480586409568787,
1122
- "learning_rate": 2.8113491797790426e-05,
1123
- "loss": 0.0481,
1124
- "step": 7850
1125
- },
1126
- {
1127
- "epoch": 2.6447941078004686,
1128
- "grad_norm": 0.2223973125219345,
1129
- "learning_rate": 2.797399843767437e-05,
1130
- "loss": 0.0432,
1131
- "step": 7900
1132
- },
1133
- {
1134
- "epoch": 2.661533311014396,
1135
- "grad_norm": 0.15986157953739166,
1136
- "learning_rate": 2.783450507755831e-05,
1137
- "loss": 0.0454,
1138
- "step": 7950
1139
- },
1140
- {
1141
- "epoch": 2.678272514228323,
1142
- "grad_norm": 0.1384258270263672,
1143
- "learning_rate": 2.7695011717442248e-05,
1144
- "loss": 0.0477,
1145
- "step": 8000
1146
- },
1147
- {
1148
- "epoch": 2.6950117174422497,
1149
- "grad_norm": 0.1721869707107544,
1150
- "learning_rate": 2.7555518357326192e-05,
1151
- "loss": 0.0453,
1152
- "step": 8050
1153
- },
1154
- {
1155
- "epoch": 2.7117509206561765,
1156
- "grad_norm": 0.20737840235233307,
1157
- "learning_rate": 2.7416024997210132e-05,
1158
- "loss": 0.0504,
1159
- "step": 8100
1160
- },
1161
- {
1162
- "epoch": 2.728490123870104,
1163
- "grad_norm": 0.18823584914207458,
1164
- "learning_rate": 2.7276531637094077e-05,
1165
- "loss": 0.0453,
1166
- "step": 8150
1167
- },
1168
- {
1169
- "epoch": 2.7452293270840307,
1170
- "grad_norm": 0.13201962411403656,
1171
- "learning_rate": 2.7137038276978017e-05,
1172
- "loss": 0.0433,
1173
- "step": 8200
1174
- },
1175
- {
1176
- "epoch": 2.761968530297958,
1177
- "grad_norm": 0.1443973183631897,
1178
- "learning_rate": 2.699754491686196e-05,
1179
- "loss": 0.0486,
1180
- "step": 8250
1181
- },
1182
- {
1183
- "epoch": 2.778707733511885,
1184
- "grad_norm": 0.29314514994621277,
1185
- "learning_rate": 2.68580515567459e-05,
1186
- "loss": 0.05,
1187
- "step": 8300
1188
- },
1189
- {
1190
- "epoch": 2.7954469367258117,
1191
- "grad_norm": 0.14852124452590942,
1192
- "learning_rate": 2.671855819662984e-05,
1193
- "loss": 0.0495,
1194
- "step": 8350
1195
- },
1196
- {
1197
- "epoch": 2.812186139939739,
1198
- "grad_norm": 0.19024662673473358,
1199
- "learning_rate": 2.6579064836513783e-05,
1200
- "loss": 0.0508,
1201
- "step": 8400
1202
- },
1203
- {
1204
- "epoch": 2.828925343153666,
1205
- "grad_norm": 0.1745578795671463,
1206
- "learning_rate": 2.6439571476397724e-05,
1207
- "loss": 0.0443,
1208
- "step": 8450
1209
- },
1210
- {
1211
- "epoch": 2.845664546367593,
1212
- "grad_norm": 0.18390017747879028,
1213
- "learning_rate": 2.6300078116281668e-05,
1214
- "loss": 0.0468,
1215
- "step": 8500
1216
- },
1217
- {
1218
- "epoch": 2.86240374958152,
1219
- "grad_norm": 0.22483347356319427,
1220
- "learning_rate": 2.616058475616561e-05,
1221
- "loss": 0.0467,
1222
- "step": 8550
1223
- },
1224
- {
1225
- "epoch": 2.879142952795447,
1226
- "grad_norm": 0.18160563707351685,
1227
- "learning_rate": 2.6021091396049553e-05,
1228
- "loss": 0.0441,
1229
- "step": 8600
1230
- },
1231
- {
1232
- "epoch": 2.8958821560093737,
1233
- "grad_norm": 0.13408955931663513,
1234
- "learning_rate": 2.588159803593349e-05,
1235
- "loss": 0.0446,
1236
- "step": 8650
1237
- },
1238
- {
1239
- "epoch": 2.912621359223301,
1240
- "grad_norm": 0.16038326919078827,
1241
- "learning_rate": 2.574210467581743e-05,
1242
- "loss": 0.0456,
1243
- "step": 8700
1244
- },
1245
- {
1246
- "epoch": 2.929360562437228,
1247
- "grad_norm": 0.22738413512706757,
1248
- "learning_rate": 2.5602611315701375e-05,
1249
- "loss": 0.0479,
1250
- "step": 8750
1251
- },
1252
- {
1253
- "epoch": 2.946099765651155,
1254
- "grad_norm": 0.20327210426330566,
1255
- "learning_rate": 2.5463117955585315e-05,
1256
- "loss": 0.0511,
1257
- "step": 8800
1258
- },
1259
- {
1260
- "epoch": 2.962838968865082,
1261
- "grad_norm": 0.15756353735923767,
1262
- "learning_rate": 2.532362459546926e-05,
1263
- "loss": 0.0426,
1264
- "step": 8850
1265
- },
1266
- {
1267
- "epoch": 2.979578172079009,
1268
- "grad_norm": 0.1305045783519745,
1269
- "learning_rate": 2.5184131235353197e-05,
1270
- "loss": 0.0442,
1271
- "step": 8900
1272
- },
1273
- {
1274
- "epoch": 2.996317375292936,
1275
- "grad_norm": 0.1610562801361084,
1276
- "learning_rate": 2.5044637875237144e-05,
1277
- "loss": 0.0467,
1278
- "step": 8950
1279
- },
1280
- {
1281
- "epoch": 3.0,
1282
- "eval_loss": 0.04702676460146904,
1283
- "eval_runtime": 52.8989,
1284
- "eval_samples_per_second": 225.865,
1285
- "eval_steps_per_second": 14.121,
1286
- "step": 8961
1287
- }
1288
- ],
1289
- "logging_steps": 50,
1290
- "max_steps": 17922,
1291
- "num_input_tokens_seen": 0,
1292
- "num_train_epochs": 6,
1293
- "save_steps": 500,
1294
- "stateful_callbacks": {
1295
- "TrainerControl": {
1296
- "args": {
1297
- "should_epoch_stop": false,
1298
- "should_evaluate": false,
1299
- "should_log": false,
1300
- "should_save": true,
1301
- "should_training_stop": false
1302
- },
1303
- "attributes": {}
1304
- }
1305
- },
1306
- "total_flos": 1.944084787966771e+16,
1307
- "train_batch_size": 16,
1308
- "trial_name": null,
1309
- "trial_params": null
1310
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-8961/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:37dd546b69fb60d8deb15a8b88e40b23e367c0e9f5a053ea3ae7c730b3874f2e
3
- size 5304
 
 
 
 
checkpoint-8961/vocab.json DELETED
The diff for this file is too large to render. See raw diff