tyzhu committed
Commit cdedf7a · verified · 1 parent: af90c5f

Training in progress, epoch 9, checkpoint

checkpoint-1179/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "<|endoftext|>": 50256
+ }
checkpoint-1179/config.json ADDED
@@ -0,0 +1,42 @@
+ {
+ "_name_or_path": "gpt2-xl",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "do_sample": true,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "max_length": 50,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 1600,
+ "n_head": 25,
+ "n_inner": null,
+ "n_layer": 48,
+ "n_positions": 1024,
+ "output_past": true,
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "torch_dtype": "float32",
+ "transformers_version": "4.34.0",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
checkpoint-1179/generation_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 50256,
+ "do_sample": true,
+ "eos_token_id": 50256,
+ "max_length": 50,
+ "transformers_version": "4.34.0"
+ }
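
Together, config.json and generation_config.json above describe a GPT2-XL causal LM (48 layers, n_embd 1600, vocab 50257) that samples up to 50 tokens by default. A minimal loading sketch, assuming the checkpoint-1179 directory has been downloaded locally; the local path and prompt below are placeholders, not part of this commit:

```python
# Sketch: load the checkpoint with transformers and sample text.
# Assumes checkpoint-1179/ was downloaded locally; path and prompt are placeholders.
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt_dir = "checkpoint-1179"
tokenizer = AutoTokenizer.from_pretrained(ckpt_dir)
model = AutoModelForCausalLM.from_pretrained(ckpt_dir)

inputs = tokenizer("Example prompt", return_tensors="pt")
# do_sample=True and max_length=50 mirror generation_config.json
outputs = model.generate(**inputs, do_sample=True, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```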
checkpoint-1179/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1179/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c00c409e414ba8bfd0de184b01885de203c30ccb66684be9ea107e09824f97a4
+ size 12461385454
checkpoint-1179/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d540c198aeed6709c9c422e150e88c3ad5a9dd48c432064a6ae447ea4f13c79
+ size 6230637102
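
optimizer.pt and pytorch_model.bin are committed as Git LFS pointer files: the three lines record only the LFS spec version, the SHA-256 of the real blob, and its size in bytes (about 12.5 GB of optimizer state and 6.2 GB of float32 weights, consistent with GPT2-XL's roughly 1.5B parameters). A hedged sketch of resolving the pointer to the actual file with huggingface_hub; the repo id below is a placeholder, since this commit page does not show the repository name:

```python
# Sketch: download the real weight file that the LFS pointer refers to.
# REPO_ID is a placeholder; the repository name is not shown on this commit page.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="tyzhu/REPO_ID",                       # hypothetical repo id
    filename="checkpoint-1179/pytorch_model.bin",  # file added in this commit
    revision="cdedf7a",                            # the commit shown above
)
print(path)  # local cache path of the downloaded ~6.2 GB file
```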
checkpoint-1179/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91331de4a996849edb488c72af532dc366b219ef9e393ae825630f2696519739
+ size 14244
checkpoint-1179/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e98453e4059c93abdbfc0047ac1669310799b1753d0ef96bbc94280598ab0d6a
+ size 1064
checkpoint-1179/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "<|endoftext|>",
+ "eos_token": "<|endoftext|>",
+ "pad_token": "<|endoftext|>",
+ "unk_token": "<|endoftext|>"
+ }
checkpoint-1179/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1179/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "additional_special_tokens": [],
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1024,
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
checkpoint-1179/trainer_state.json ADDED
@@ -0,0 +1,846 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.0,
+ "eval_steps": 66,
+ "global_step": 1179,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.11,
+ "learning_rate": 3e-05,
+ "loss": 3.7331,
+ "step": 14
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 3e-05,
+ "loss": 2.345,
+ "step": 28
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 3e-05,
+ "loss": 2.1491,
+ "step": 42
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 3e-05,
+ "loss": 2.0739,
+ "step": 56
+ },
+ {
+ "epoch": 0.5,
+ "eval_accuracy": 0.6206453178068898,
+ "eval_loss": 1.8404711484909058,
+ "eval_runtime": 11.9883,
+ "eval_samples_per_second": 25.024,
+ "eval_steps_per_second": 1.585,
+ "step": 66
+ },
+ {
+ "epoch": 0.5,
+ "eval_exact_match": 7.666666666666667,
+ "eval_f1": 10.221428571428572,
+ "eval_qa_bleu": 1.4042262195131967,
+ "eval_qa_exact_match": 0.07333333333333333,
+ "eval_recite_bleu": 8.5956480576491,
+ "eval_recite_exact_match": 0.0,
+ "step": 66
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 3e-05,
+ "loss": 1.9722,
+ "step": 70
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 3e-05,
+ "loss": 2.014,
+ "step": 84
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3e-05,
+ "loss": 1.9812,
+ "step": 98
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 3e-05,
+ "loss": 1.9213,
+ "step": 112
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 3e-05,
+ "loss": 1.8806,
+ "step": 126
+ },
+ {
+ "epoch": 1.01,
+ "eval_accuracy": 0.6365065502183406,
+ "eval_loss": 1.574863314628601,
+ "eval_runtime": 9.1867,
+ "eval_samples_per_second": 32.656,
+ "eval_steps_per_second": 2.068,
+ "step": 132
+ },
+ {
+ "epoch": 1.01,
+ "eval_exact_match": 8.666666666666666,
+ "eval_f1": 13.8123783922171,
+ "eval_qa_bleu": 3.0725240037081307,
+ "eval_qa_exact_match": 0.08666666666666667,
+ "eval_recite_bleu": 16.386073781113847,
+ "eval_recite_exact_match": 0.0,
+ "step": 132
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 3e-05,
+ "loss": 1.6177,
+ "step": 140
+ },
+ {
+ "epoch": 1.18,
+ "learning_rate": 3e-05,
+ "loss": 1.4286,
+ "step": 154
+ },
+ {
+ "epoch": 1.28,
+ "learning_rate": 3e-05,
+ "loss": 1.4102,
+ "step": 168
+ },
+ {
+ "epoch": 1.39,
+ "learning_rate": 3e-05,
+ "loss": 1.4033,
+ "step": 182
+ },
+ {
+ "epoch": 1.5,
+ "learning_rate": 3e-05,
+ "loss": 1.3619,
+ "step": 196
+ },
+ {
+ "epoch": 1.51,
+ "eval_accuracy": 0.6533139252789908,
+ "eval_loss": 1.342494010925293,
+ "eval_runtime": 9.3165,
+ "eval_samples_per_second": 32.201,
+ "eval_steps_per_second": 2.039,
+ "step": 198
+ },
+ {
+ "epoch": 1.51,
+ "eval_exact_match": 7.0,
+ "eval_f1": 10.38888888888889,
+ "eval_qa_bleu": 5.590724094958645,
+ "eval_qa_exact_match": 0.06666666666666667,
+ "eval_recite_bleu": 17.070052559724132,
+ "eval_recite_exact_match": 0.0,
+ "step": 198
+ },
+ {
+ "epoch": 1.6,
+ "learning_rate": 3e-05,
+ "loss": 1.3649,
+ "step": 210
+ },
+ {
+ "epoch": 1.71,
+ "learning_rate": 3e-05,
+ "loss": 1.2886,
+ "step": 224
+ },
+ {
+ "epoch": 1.82,
+ "learning_rate": 3e-05,
+ "loss": 1.2787,
+ "step": 238
+ },
+ {
+ "epoch": 1.92,
+ "learning_rate": 3e-05,
+ "loss": 1.283,
+ "step": 252
+ },
+ {
+ "epoch": 2.02,
+ "eval_accuracy": 0.6685152838427948,
+ "eval_loss": 1.1253471374511719,
+ "eval_runtime": 9.3443,
+ "eval_samples_per_second": 32.105,
+ "eval_steps_per_second": 2.033,
+ "step": 264
+ },
+ {
+ "epoch": 2.02,
+ "eval_exact_match": 9.0,
+ "eval_f1": 13.042195767195768,
+ "eval_qa_bleu": 5.743589453455261,
+ "eval_qa_exact_match": 0.08666666666666667,
+ "eval_recite_bleu": 15.4059437725803,
+ "eval_recite_exact_match": 0.0,
+ "step": 264
+ },
+ {
+ "epoch": 2.03,
+ "learning_rate": 3e-05,
+ "loss": 1.112,
+ "step": 266
+ },
+ {
+ "epoch": 2.14,
+ "learning_rate": 3e-05,
+ "loss": 0.8377,
+ "step": 280
+ },
+ {
+ "epoch": 2.24,
+ "learning_rate": 3e-05,
+ "loss": 0.8533,
+ "step": 294
+ },
+ {
+ "epoch": 2.35,
+ "learning_rate": 3e-05,
+ "loss": 0.8407,
+ "step": 308
+ },
+ {
+ "epoch": 2.46,
+ "learning_rate": 3e-05,
+ "loss": 0.8433,
+ "step": 322
+ },
+ {
+ "epoch": 2.52,
+ "eval_accuracy": 0.6825278990781174,
+ "eval_loss": 0.9735248684883118,
+ "eval_runtime": 9.3401,
+ "eval_samples_per_second": 32.12,
+ "eval_steps_per_second": 2.034,
+ "step": 330
+ },
+ {
+ "epoch": 2.52,
+ "eval_exact_match": 11.666666666666666,
+ "eval_f1": 15.95859788359788,
+ "eval_qa_bleu": 6.31601664322449,
+ "eval_qa_exact_match": 0.11,
+ "eval_recite_bleu": 21.285839887136575,
+ "eval_recite_exact_match": 0.0,
+ "step": 330
+ },
+ {
+ "epoch": 2.56,
+ "learning_rate": 3e-05,
+ "loss": 0.8632,
+ "step": 336
+ },
+ {
+ "epoch": 2.67,
+ "learning_rate": 3e-05,
+ "loss": 0.8308,
+ "step": 350
+ },
+ {
+ "epoch": 2.78,
+ "learning_rate": 3e-05,
+ "loss": 0.8475,
+ "step": 364
+ },
+ {
+ "epoch": 2.89,
+ "learning_rate": 3e-05,
+ "loss": 0.7829,
+ "step": 378
+ },
+ {
+ "epoch": 2.99,
+ "learning_rate": 3e-05,
+ "loss": 0.7629,
+ "step": 392
+ },
+ {
+ "epoch": 3.02,
+ "eval_accuracy": 0.6982241630276564,
+ "eval_loss": 0.7873561382293701,
+ "eval_runtime": 9.0856,
+ "eval_samples_per_second": 33.019,
+ "eval_steps_per_second": 2.091,
+ "step": 396
+ },
+ {
+ "epoch": 3.02,
+ "eval_exact_match": 12.333333333333334,
+ "eval_f1": 16.93478835978836,
+ "eval_qa_bleu": 7.7589360856745815,
+ "eval_qa_exact_match": 0.12,
+ "eval_recite_bleu": 22.16978951972916,
+ "eval_recite_exact_match": 0.0,
+ "step": 396
+ },
+ {
+ "epoch": 3.1,
+ "learning_rate": 3e-05,
+ "loss": 0.5537,
+ "step": 406
+ },
+ {
+ "epoch": 3.21,
+ "learning_rate": 3e-05,
+ "loss": 0.5111,
+ "step": 420
+ },
+ {
+ "epoch": 3.31,
+ "learning_rate": 3e-05,
+ "loss": 0.479,
+ "step": 434
+ },
+ {
+ "epoch": 3.42,
+ "learning_rate": 3e-05,
+ "loss": 0.4904,
+ "step": 448
+ },
+ {
+ "epoch": 3.53,
+ "learning_rate": 3e-05,
+ "loss": 0.5058,
+ "step": 462
+ },
+ {
+ "epoch": 3.53,
+ "eval_accuracy": 0.7086026200873362,
+ "eval_loss": 0.6920613646507263,
+ "eval_runtime": 9.3473,
+ "eval_samples_per_second": 32.095,
+ "eval_steps_per_second": 2.033,
+ "step": 462
+ },
+ {
+ "epoch": 3.53,
+ "eval_exact_match": 15.333333333333334,
+ "eval_f1": 20.8812384782973,
+ "eval_qa_bleu": 11.001436952878551,
+ "eval_qa_exact_match": 0.14,
+ "eval_recite_bleu": 25.319328021621963,
+ "eval_recite_exact_match": 0.0,
+ "step": 462
+ },
+ {
+ "epoch": 3.63,
+ "learning_rate": 3e-05,
+ "loss": 0.5456,
+ "step": 476
+ },
+ {
+ "epoch": 3.74,
+ "learning_rate": 3e-05,
+ "loss": 0.4998,
+ "step": 490
+ },
+ {
+ "epoch": 3.85,
+ "learning_rate": 3e-05,
+ "loss": 0.5075,
+ "step": 504
+ },
+ {
+ "epoch": 3.95,
+ "learning_rate": 3e-05,
+ "loss": 0.4593,
+ "step": 518
+ },
+ {
+ "epoch": 4.03,
+ "eval_accuracy": 0.7196943231441048,
+ "eval_loss": 0.564062237739563,
+ "eval_runtime": 9.2784,
+ "eval_samples_per_second": 32.333,
+ "eval_steps_per_second": 2.048,
+ "step": 528
+ },
+ {
+ "epoch": 4.03,
+ "eval_exact_match": 13.333333333333334,
+ "eval_f1": 18.11966301672184,
+ "eval_qa_bleu": 4.47266646700781,
+ "eval_qa_exact_match": 0.12666666666666668,
+ "eval_recite_bleu": 26.136161725029392,
+ "eval_recite_exact_match": 0.0033333333333333335,
+ "step": 528
+ },
+ {
+ "epoch": 4.06,
+ "learning_rate": 3e-05,
+ "loss": 0.3952,
+ "step": 532
+ },
+ {
+ "epoch": 4.17,
+ "learning_rate": 3e-05,
+ "loss": 0.2996,
+ "step": 546
+ },
+ {
+ "epoch": 4.27,
+ "learning_rate": 3e-05,
+ "loss": 0.3354,
+ "step": 560
+ },
+ {
+ "epoch": 4.38,
+ "learning_rate": 3e-05,
+ "loss": 0.3179,
+ "step": 574
+ },
+ {
+ "epoch": 4.49,
+ "learning_rate": 3e-05,
+ "loss": 0.3064,
+ "step": 588
+ },
+ {
+ "epoch": 4.53,
+ "eval_accuracy": 0.7245269286754003,
+ "eval_loss": 0.5348330736160278,
+ "eval_runtime": 9.5519,
+ "eval_samples_per_second": 31.407,
+ "eval_steps_per_second": 1.989,
+ "step": 594
+ },
+ {
+ "epoch": 4.53,
+ "eval_exact_match": 13.666666666666666,
+ "eval_f1": 19.689682539682536,
+ "eval_qa_bleu": 4.45260487005976,
+ "eval_qa_exact_match": 0.12666666666666668,
+ "eval_recite_bleu": 28.855187002245795,
+ "eval_recite_exact_match": 0.0,
+ "step": 594
+ },
+ {
+ "epoch": 4.6,
+ "learning_rate": 3e-05,
+ "loss": 0.3258,
+ "step": 602
+ },
+ {
+ "epoch": 4.7,
+ "learning_rate": 3e-05,
+ "loss": 0.3384,
+ "step": 616
+ },
+ {
+ "epoch": 4.81,
+ "learning_rate": 3e-05,
+ "loss": 0.3258,
+ "step": 630
+ },
+ {
+ "epoch": 4.92,
+ "learning_rate": 3e-05,
+ "loss": 0.312,
+ "step": 644
+ },
+ {
+ "epoch": 5.02,
+ "learning_rate": 3e-05,
+ "loss": 0.2967,
+ "step": 658
+ },
+ {
+ "epoch": 5.04,
+ "eval_accuracy": 0.7303784570596797,
+ "eval_loss": 0.47703343629837036,
+ "eval_runtime": 9.4774,
+ "eval_samples_per_second": 31.654,
+ "eval_steps_per_second": 2.005,
+ "step": 660
+ },
+ {
+ "epoch": 5.04,
+ "eval_exact_match": 12.333333333333334,
+ "eval_f1": 18.61798941798941,
+ "eval_qa_bleu": 6.60839076276961,
+ "eval_qa_exact_match": 0.12,
+ "eval_recite_bleu": 29.5293953590396,
+ "eval_recite_exact_match": 0.0,
+ "step": 660
+ },
+ {
+ "epoch": 5.13,
+ "learning_rate": 3e-05,
+ "loss": 0.2148,
+ "step": 672
+ },
+ {
+ "epoch": 5.24,
+ "learning_rate": 3e-05,
+ "loss": 0.2275,
+ "step": 686
+ },
+ {
+ "epoch": 5.34,
+ "learning_rate": 3e-05,
+ "loss": 0.2158,
+ "step": 700
+ },
+ {
+ "epoch": 5.45,
+ "learning_rate": 3e-05,
+ "loss": 0.2167,
+ "step": 714
+ },
+ {
+ "epoch": 5.54,
+ "eval_accuracy": 0.732372634643377,
+ "eval_loss": 0.458192378282547,
+ "eval_runtime": 9.1255,
+ "eval_samples_per_second": 32.875,
+ "eval_steps_per_second": 2.082,
+ "step": 726
+ },
+ {
+ "epoch": 5.54,
+ "eval_exact_match": 14.333333333333334,
+ "eval_f1": 19.81243386243386,
+ "eval_qa_bleu": 8.961623752889384,
+ "eval_qa_exact_match": 0.13666666666666666,
+ "eval_recite_bleu": 33.41904095334099,
+ "eval_recite_exact_match": 0.0,
+ "step": 726
+ },
+ {
+ "epoch": 5.56,
+ "learning_rate": 3e-05,
+ "loss": 0.229,
+ "step": 728
+ },
+ {
+ "epoch": 5.66,
+ "learning_rate": 3e-05,
+ "loss": 0.2275,
+ "step": 742
+ },
+ {
+ "epoch": 5.77,
+ "learning_rate": 3e-05,
+ "loss": 0.2211,
+ "step": 756
+ },
+ {
+ "epoch": 5.88,
+ "learning_rate": 3e-05,
+ "loss": 0.2231,
+ "step": 770
+ },
+ {
+ "epoch": 5.98,
+ "learning_rate": 3e-05,
+ "loss": 0.2157,
+ "step": 784
+ },
+ {
+ "epoch": 6.05,
+ "eval_accuracy": 0.7358369723435225,
+ "eval_loss": 0.4307834804058075,
+ "eval_runtime": 9.1847,
+ "eval_samples_per_second": 32.663,
+ "eval_steps_per_second": 2.069,
+ "step": 792
+ },
+ {
+ "epoch": 6.05,
+ "eval_exact_match": 16.666666666666668,
+ "eval_f1": 21.786772486772488,
+ "eval_qa_bleu": 7.511563755726586,
+ "eval_qa_exact_match": 0.16,
+ "eval_recite_bleu": 37.63384023220464,
+ "eval_recite_exact_match": 0.013333333333333334,
+ "step": 792
+ },
+ {
+ "epoch": 6.09,
+ "learning_rate": 3e-05,
+ "loss": 0.1669,
+ "step": 798
+ },
+ {
+ "epoch": 6.2,
+ "learning_rate": 3e-05,
+ "loss": 0.1712,
+ "step": 812
+ },
+ {
+ "epoch": 6.31,
+ "learning_rate": 3e-05,
+ "loss": 0.1601,
+ "step": 826
+ },
+ {
+ "epoch": 6.41,
+ "learning_rate": 3e-05,
+ "loss": 0.1608,
+ "step": 840
+ },
+ {
+ "epoch": 6.52,
+ "learning_rate": 3e-05,
+ "loss": 0.1597,
+ "step": 854
+ },
+ {
+ "epoch": 6.55,
+ "eval_accuracy": 0.7373410965550704,
+ "eval_loss": 0.4301389157772064,
+ "eval_runtime": 9.7372,
+ "eval_samples_per_second": 30.81,
+ "eval_steps_per_second": 1.951,
+ "step": 858
+ },
+ {
+ "epoch": 6.55,
+ "eval_exact_match": 15.666666666666666,
+ "eval_f1": 21.056661856661847,
+ "eval_qa_bleu": 12.649140852831426,
+ "eval_qa_exact_match": 0.14666666666666667,
+ "eval_recite_bleu": 42.22665248887737,
+ "eval_recite_exact_match": 0.013333333333333334,
+ "step": 858
+ },
+ {
+ "epoch": 6.63,
+ "learning_rate": 3e-05,
+ "loss": 0.1623,
+ "step": 868
+ },
+ {
+ "epoch": 6.73,
+ "learning_rate": 3e-05,
+ "loss": 0.1668,
+ "step": 882
+ },
+ {
+ "epoch": 6.84,
+ "learning_rate": 3e-05,
+ "loss": 0.1624,
+ "step": 896
+ },
+ {
+ "epoch": 6.95,
+ "learning_rate": 3e-05,
+ "loss": 0.1648,
+ "step": 910
+ },
+ {
+ "epoch": 7.05,
+ "learning_rate": 3e-05,
+ "loss": 0.1481,
+ "step": 924
+ },
+ {
+ "epoch": 7.05,
+ "eval_accuracy": 0.7385298398835517,
+ "eval_loss": 0.42236796021461487,
+ "eval_runtime": 9.3603,
+ "eval_samples_per_second": 32.05,
+ "eval_steps_per_second": 2.03,
+ "step": 924
+ },
+ {
+ "epoch": 7.05,
+ "eval_exact_match": 18.666666666666668,
+ "eval_f1": 25.187830687830694,
+ "eval_qa_bleu": 8.014835952265651,
+ "eval_qa_exact_match": 0.18,
+ "eval_recite_bleu": 42.38987173856079,
+ "eval_recite_exact_match": 0.016666666666666666,
+ "step": 924
+ },
+ {
+ "epoch": 7.16,
+ "learning_rate": 3e-05,
+ "loss": 0.1227,
+ "step": 938
+ },
+ {
+ "epoch": 7.27,
+ "learning_rate": 3e-05,
+ "loss": 0.1272,
+ "step": 952
+ },
+ {
+ "epoch": 7.37,
+ "learning_rate": 3e-05,
+ "loss": 0.1312,
+ "step": 966
+ },
+ {
+ "epoch": 7.48,
+ "learning_rate": 3e-05,
+ "loss": 0.1293,
+ "step": 980
+ },
+ {
+ "epoch": 7.56,
+ "eval_accuracy": 0.739422610383309,
+ "eval_loss": 0.41248488426208496,
+ "eval_runtime": 9.7486,
+ "eval_samples_per_second": 30.774,
+ "eval_steps_per_second": 1.949,
+ "step": 990
+ },
+ {
+ "epoch": 7.56,
+ "eval_exact_match": 15.666666666666666,
+ "eval_f1": 22.348196248196246,
+ "eval_qa_bleu": 6.260683683577316,
+ "eval_qa_exact_match": 0.14666666666666667,
+ "eval_recite_bleu": 45.04221504063147,
+ "eval_recite_exact_match": 0.023333333333333334,
+ "step": 990
+ },
+ {
+ "epoch": 7.59,
+ "learning_rate": 3e-05,
+ "loss": 0.1345,
+ "step": 994
+ },
+ {
+ "epoch": 7.69,
+ "learning_rate": 3e-05,
+ "loss": 0.1325,
+ "step": 1008
+ },
+ {
+ "epoch": 7.8,
+ "learning_rate": 3e-05,
+ "loss": 0.1273,
+ "step": 1022
+ },
+ {
+ "epoch": 7.91,
+ "learning_rate": 3e-05,
+ "loss": 0.1362,
+ "step": 1036
+ },
+ {
+ "epoch": 8.02,
+ "learning_rate": 3e-05,
+ "loss": 0.125,
+ "step": 1050
+ },
+ {
+ "epoch": 8.06,
+ "eval_accuracy": 0.7399902959728287,
+ "eval_loss": 0.41223272681236267,
+ "eval_runtime": 9.2562,
+ "eval_samples_per_second": 32.411,
+ "eval_steps_per_second": 2.053,
+ "step": 1056
+ },
+ {
+ "epoch": 8.06,
+ "eval_exact_match": 18.0,
+ "eval_f1": 25.051058201058197,
+ "eval_qa_bleu": 10.352483602423003,
+ "eval_qa_exact_match": 0.17,
+ "eval_recite_bleu": 46.26228887496748,
+ "eval_recite_exact_match": 0.04666666666666667,
+ "step": 1056
+ },
+ {
+ "epoch": 8.12,
+ "learning_rate": 3e-05,
+ "loss": 0.11,
+ "step": 1064
+ },
+ {
+ "epoch": 8.23,
+ "learning_rate": 3e-05,
+ "loss": 0.1045,
+ "step": 1078
+ },
+ {
+ "epoch": 8.34,
+ "learning_rate": 3e-05,
+ "loss": 0.1179,
+ "step": 1092
+ },
+ {
+ "epoch": 8.44,
+ "learning_rate": 3e-05,
+ "loss": 0.1083,
+ "step": 1106
+ },
+ {
+ "epoch": 8.55,
+ "learning_rate": 3e-05,
+ "loss": 0.1139,
+ "step": 1120
+ },
+ {
+ "epoch": 8.56,
+ "eval_accuracy": 0.7406501698204755,
+ "eval_loss": 0.40691348910331726,
+ "eval_runtime": 9.0014,
+ "eval_samples_per_second": 33.328,
+ "eval_steps_per_second": 2.111,
+ "step": 1122
+ },
+ {
+ "epoch": 8.56,
+ "eval_exact_match": 16.333333333333332,
+ "eval_f1": 23.766955266955264,
+ "eval_qa_bleu": 11.966740392922118,
+ "eval_qa_exact_match": 0.15333333333333332,
+ "eval_recite_bleu": 45.74271226096357,
+ "eval_recite_exact_match": 0.03333333333333333,
+ "step": 1122
+ },
+ {
+ "epoch": 8.66,
+ "learning_rate": 3e-05,
+ "loss": 0.1108,
+ "step": 1134
+ },
+ {
+ "epoch": 8.76,
+ "learning_rate": 3e-05,
+ "loss": 0.1168,
+ "step": 1148
+ },
+ {
+ "epoch": 8.87,
+ "learning_rate": 3e-05,
+ "loss": 0.1132,
+ "step": 1162
+ },
+ {
+ "epoch": 8.98,
+ "learning_rate": 3e-05,
+ "loss": 0.1141,
+ "step": 1176
+ }
+ ],
+ "logging_steps": 14,
+ "max_steps": 1310,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "total_flos": 5.35677023130624e+16,
+ "trial_name": null,
+ "trial_params": null
+ }
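
trainer_state.json logs the training loss every 14 steps at a constant 3e-05 learning rate and runs evaluation every 66 steps; eval_loss drops from about 1.84 at step 66 to about 0.41 at step 1122 while eval_exact_match climbs from roughly 8 to the 16-18 range. A short sketch of pulling that eval curve out of the file, assuming only that it has been downloaded locally:

```python
# Sketch: extract the evaluation-loss curve from trainer_state.json.
# Assumes the file was downloaded locally; the path is a placeholder.
import json

with open("checkpoint-1179/trainer_state.json") as f:
    state = json.load(f)

eval_points = [
    (entry["step"], entry["eval_loss"])
    for entry in state["log_history"]
    if "eval_loss" in entry
]
for step, loss in eval_points:
    print(f"step {step:4d}  eval_loss {loss:.4f}")
```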
checkpoint-1179/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e2f5bbca2ac6551a76ff4a6e10000fae190ff8cb3817e74f2e51a104a179e48
+ size 4728
checkpoint-1179/vocab.json ADDED
The diff for this file is too large to render. See raw diff