tyzhu committed on
Commit
a8ed472
·
verified ·
1 Parent(s): 30ea87d

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -2
  2. all_results.json +21 -0
  3. eval_results.json +16 -0
  4. train_results.json +8 -0
  5. trainer_state.json +788 -0
README.md CHANGED
@@ -3,11 +3,23 @@ license: mit
3
  base_model: gpt2-xl
4
  tags:
5
  - generated_from_trainer
 
 
6
  metrics:
7
  - accuracy
8
  model-index:
9
  - name: lmind_hotpot_train1000_eval200_v1_recite_qa_gpt2-xl
10
- results: []
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -15,7 +27,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # lmind_hotpot_train1000_eval200_v1_recite_qa_gpt2-xl
17
 
18
- This model is a fine-tuned version of [gpt2-xl](https://huggingface.co/gpt2-xl) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.4436
21
  - Accuracy: 0.6989
 
3
  base_model: gpt2-xl
4
  tags:
5
  - generated_from_trainer
6
+ datasets:
7
+ - tyzhu/lmind_hotpot_train1000_eval200_v1_recite_qa
8
  metrics:
9
  - accuracy
10
  model-index:
11
  - name: lmind_hotpot_train1000_eval200_v1_recite_qa_gpt2-xl
12
+ results:
13
+ - task:
14
+ name: Causal Language Modeling
15
+ type: text-generation
16
+ dataset:
17
+ name: tyzhu/lmind_hotpot_train1000_eval200_v1_recite_qa
18
+ type: tyzhu/lmind_hotpot_train1000_eval200_v1_recite_qa
19
+ metrics:
20
+ - name: Accuracy
21
+ type: accuracy
22
+ value: 0.6988971684053651
23
  ---
24
 
25
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
27
 
28
  # lmind_hotpot_train1000_eval200_v1_recite_qa_gpt2-xl
29
 
30
+ This model is a fine-tuned version of [gpt2-xl](https://huggingface.co/gpt2-xl) on the tyzhu/lmind_hotpot_train1000_eval200_v1_recite_qa dataset.
31
  It achieves the following results on the evaluation set:
32
  - Loss: 0.4436
33
  - Accuracy: 0.6989
all_results.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.6988971684053651,
4
+ "eval_exact_match": 20.5,
5
+ "eval_f1": 25.95833333333333,
6
+ "eval_loss": 0.4435840845108032,
7
+ "eval_qa_bleu": 12.082719988904218,
8
+ "eval_qa_exact_match": 0.18,
9
+ "eval_recite_bleu": 44.74289478974747,
10
+ "eval_recite_exact_match": 0.075,
11
+ "eval_runtime": 6.1906,
12
+ "eval_samples": 200,
13
+ "eval_samples_per_second": 32.307,
14
+ "eval_steps_per_second": 2.1,
15
+ "perplexity": 1.5582822371040086,
16
+ "train_loss": 0.29445283661521443,
17
+ "train_runtime": 3197.9615,
18
+ "train_samples": 3373,
19
+ "train_samples_per_second": 10.547,
20
+ "train_steps_per_second": 0.66
21
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.6988971684053651,
4
+ "eval_exact_match": 20.5,
5
+ "eval_f1": 25.95833333333333,
6
+ "eval_loss": 0.4435840845108032,
7
+ "eval_qa_bleu": 12.082719988904218,
8
+ "eval_qa_exact_match": 0.18,
9
+ "eval_recite_bleu": 44.74289478974747,
10
+ "eval_recite_exact_match": 0.075,
11
+ "eval_runtime": 6.1906,
12
+ "eval_samples": 200,
13
+ "eval_samples_per_second": 32.307,
14
+ "eval_steps_per_second": 2.1,
15
+ "perplexity": 1.5582822371040086
16
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "train_loss": 0.29445283661521443,
4
+ "train_runtime": 3197.9615,
5
+ "train_samples": 3373,
6
+ "train_samples_per_second": 10.547,
7
+ "train_steps_per_second": 0.66
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,788 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 2110,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1,
13
+ "learning_rate": 6.226415094339623e-06,
14
+ "loss": 1.1807,
15
+ "step": 22
16
+ },
17
+ {
18
+ "epoch": 0.21,
19
+ "learning_rate": 1.2452830188679246e-05,
20
+ "loss": 1.1325,
21
+ "step": 44
22
+ },
23
+ {
24
+ "epoch": 0.31,
25
+ "learning_rate": 1.8679245283018867e-05,
26
+ "loss": 0.9876,
27
+ "step": 66
28
+ },
29
+ {
30
+ "epoch": 0.42,
31
+ "learning_rate": 2.4905660377358492e-05,
32
+ "loss": 0.873,
33
+ "step": 88
34
+ },
35
+ {
36
+ "epoch": 0.52,
37
+ "learning_rate": 2.994011976047904e-05,
38
+ "loss": 0.7931,
39
+ "step": 110
40
+ },
41
+ {
42
+ "epoch": 0.63,
43
+ "learning_rate": 2.9610778443113774e-05,
44
+ "loss": 0.7764,
45
+ "step": 132
46
+ },
47
+ {
48
+ "epoch": 0.73,
49
+ "learning_rate": 2.9281437125748504e-05,
50
+ "loss": 0.7701,
51
+ "step": 154
52
+ },
53
+ {
54
+ "epoch": 0.83,
55
+ "learning_rate": 2.8952095808383233e-05,
56
+ "loss": 0.739,
57
+ "step": 176
58
+ },
59
+ {
60
+ "epoch": 0.94,
61
+ "learning_rate": 2.8622754491017966e-05,
62
+ "loss": 0.7644,
63
+ "step": 198
64
+ },
65
+ {
66
+ "epoch": 1.0,
67
+ "eval_accuracy": 0.6429210134128167,
68
+ "eval_loss": 1.0036486387252808,
69
+ "eval_runtime": 6.1897,
70
+ "eval_samples_per_second": 32.312,
71
+ "eval_steps_per_second": 2.1,
72
+ "step": 211
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_exact_match": 14.5,
77
+ "eval_f1": 19.727380952380955,
78
+ "eval_qa_bleu": 9.309381343075087,
79
+ "eval_qa_exact_match": 0.14,
80
+ "eval_recite_bleu": 15.818078125717689,
81
+ "eval_recite_exact_match": 0.0,
82
+ "step": 211
83
+ },
84
+ {
85
+ "epoch": 1.04,
86
+ "learning_rate": 2.8293413173652696e-05,
87
+ "loss": 0.6488,
88
+ "step": 220
89
+ },
90
+ {
91
+ "epoch": 1.15,
92
+ "learning_rate": 2.7964071856287425e-05,
93
+ "loss": 0.5286,
94
+ "step": 242
95
+ },
96
+ {
97
+ "epoch": 1.25,
98
+ "learning_rate": 2.763473053892216e-05,
99
+ "loss": 0.5157,
100
+ "step": 264
101
+ },
102
+ {
103
+ "epoch": 1.36,
104
+ "learning_rate": 2.7305389221556884e-05,
105
+ "loss": 0.4838,
106
+ "step": 286
107
+ },
108
+ {
109
+ "epoch": 1.46,
110
+ "learning_rate": 2.6976047904191617e-05,
111
+ "loss": 0.5013,
112
+ "step": 308
113
+ },
114
+ {
115
+ "epoch": 1.56,
116
+ "learning_rate": 2.664670658682635e-05,
117
+ "loss": 0.4649,
118
+ "step": 330
119
+ },
120
+ {
121
+ "epoch": 1.67,
122
+ "learning_rate": 2.6317365269461076e-05,
123
+ "loss": 0.4604,
124
+ "step": 352
125
+ },
126
+ {
127
+ "epoch": 1.77,
128
+ "learning_rate": 2.598802395209581e-05,
129
+ "loss": 0.5064,
130
+ "step": 374
131
+ },
132
+ {
133
+ "epoch": 1.88,
134
+ "learning_rate": 2.565868263473054e-05,
135
+ "loss": 0.4732,
136
+ "step": 396
137
+ },
138
+ {
139
+ "epoch": 1.98,
140
+ "learning_rate": 2.5329341317365268e-05,
141
+ "loss": 0.526,
142
+ "step": 418
143
+ },
144
+ {
145
+ "epoch": 2.0,
146
+ "eval_accuracy": 0.6657600596125186,
147
+ "eval_loss": 0.7350580096244812,
148
+ "eval_runtime": 6.5425,
149
+ "eval_samples_per_second": 30.569,
150
+ "eval_steps_per_second": 1.987,
151
+ "step": 422
152
+ },
153
+ {
154
+ "epoch": 2.0,
155
+ "eval_exact_match": 12.5,
156
+ "eval_f1": 19.69947691197691,
157
+ "eval_qa_bleu": 9.452490656702446,
158
+ "eval_qa_exact_match": 0.12,
159
+ "eval_recite_bleu": 17.493375608153375,
160
+ "eval_recite_exact_match": 0.0,
161
+ "step": 422
162
+ },
163
+ {
164
+ "epoch": 2.09,
165
+ "learning_rate": 2.5e-05,
166
+ "loss": 0.4741,
167
+ "step": 440
168
+ },
169
+ {
170
+ "epoch": 2.19,
171
+ "learning_rate": 2.467065868263473e-05,
172
+ "loss": 0.4338,
173
+ "step": 462
174
+ },
175
+ {
176
+ "epoch": 2.29,
177
+ "learning_rate": 2.4341317365269464e-05,
178
+ "loss": 0.4451,
179
+ "step": 484
180
+ },
181
+ {
182
+ "epoch": 2.4,
183
+ "learning_rate": 2.4011976047904193e-05,
184
+ "loss": 0.4312,
185
+ "step": 506
186
+ },
187
+ {
188
+ "epoch": 2.5,
189
+ "learning_rate": 2.3682634730538923e-05,
190
+ "loss": 0.4363,
191
+ "step": 528
192
+ },
193
+ {
194
+ "epoch": 2.61,
195
+ "learning_rate": 2.3353293413173656e-05,
196
+ "loss": 0.4302,
197
+ "step": 550
198
+ },
199
+ {
200
+ "epoch": 2.71,
201
+ "learning_rate": 2.302395209580838e-05,
202
+ "loss": 0.4528,
203
+ "step": 572
204
+ },
205
+ {
206
+ "epoch": 2.82,
207
+ "learning_rate": 2.2694610778443115e-05,
208
+ "loss": 0.4086,
209
+ "step": 594
210
+ },
211
+ {
212
+ "epoch": 2.92,
213
+ "learning_rate": 2.2365269461077847e-05,
214
+ "loss": 0.4163,
215
+ "step": 616
216
+ },
217
+ {
218
+ "epoch": 3.0,
219
+ "eval_accuracy": 0.6815499254843517,
220
+ "eval_loss": 0.5743635296821594,
221
+ "eval_runtime": 6.2699,
222
+ "eval_samples_per_second": 31.898,
223
+ "eval_steps_per_second": 2.073,
224
+ "step": 633
225
+ },
226
+ {
227
+ "epoch": 3.0,
228
+ "eval_exact_match": 13.0,
229
+ "eval_f1": 20.81829004329004,
230
+ "eval_qa_bleu": 7.245823931129993,
231
+ "eval_qa_exact_match": 0.12,
232
+ "eval_recite_bleu": 22.248138618982026,
233
+ "eval_recite_exact_match": 0.0,
234
+ "step": 633
235
+ },
236
+ {
237
+ "epoch": 3.02,
238
+ "learning_rate": 2.2035928143712574e-05,
239
+ "loss": 0.3866,
240
+ "step": 638
241
+ },
242
+ {
243
+ "epoch": 3.13,
244
+ "learning_rate": 2.1706586826347306e-05,
245
+ "loss": 0.2789,
246
+ "step": 660
247
+ },
248
+ {
249
+ "epoch": 3.23,
250
+ "learning_rate": 2.1377245508982036e-05,
251
+ "loss": 0.2932,
252
+ "step": 682
253
+ },
254
+ {
255
+ "epoch": 3.34,
256
+ "learning_rate": 2.1047904191616766e-05,
257
+ "loss": 0.2883,
258
+ "step": 704
259
+ },
260
+ {
261
+ "epoch": 3.44,
262
+ "learning_rate": 2.07185628742515e-05,
263
+ "loss": 0.3048,
264
+ "step": 726
265
+ },
266
+ {
267
+ "epoch": 3.55,
268
+ "learning_rate": 2.0389221556886228e-05,
269
+ "loss": 0.28,
270
+ "step": 748
271
+ },
272
+ {
273
+ "epoch": 3.65,
274
+ "learning_rate": 2.0059880239520957e-05,
275
+ "loss": 0.2847,
276
+ "step": 770
277
+ },
278
+ {
279
+ "epoch": 3.75,
280
+ "learning_rate": 1.9730538922155687e-05,
281
+ "loss": 0.2949,
282
+ "step": 792
283
+ },
284
+ {
285
+ "epoch": 3.86,
286
+ "learning_rate": 1.940119760479042e-05,
287
+ "loss": 0.3002,
288
+ "step": 814
289
+ },
290
+ {
291
+ "epoch": 3.96,
292
+ "learning_rate": 1.9071856287425153e-05,
293
+ "loss": 0.2864,
294
+ "step": 836
295
+ },
296
+ {
297
+ "epoch": 4.0,
298
+ "eval_accuracy": 0.689903129657228,
299
+ "eval_loss": 0.4952711760997772,
300
+ "eval_runtime": 6.337,
301
+ "eval_samples_per_second": 31.561,
302
+ "eval_steps_per_second": 2.051,
303
+ "step": 844
304
+ },
305
+ {
306
+ "epoch": 4.0,
307
+ "eval_exact_match": 17.0,
308
+ "eval_f1": 22.154761904761905,
309
+ "eval_qa_bleu": 10.09488722739415,
310
+ "eval_qa_exact_match": 0.155,
311
+ "eval_recite_bleu": 27.91257178143564,
312
+ "eval_recite_exact_match": 0.005,
313
+ "step": 844
314
+ },
315
+ {
316
+ "epoch": 4.07,
317
+ "learning_rate": 1.874251497005988e-05,
318
+ "loss": 0.2368,
319
+ "step": 858
320
+ },
321
+ {
322
+ "epoch": 4.17,
323
+ "learning_rate": 1.8413173652694612e-05,
324
+ "loss": 0.203,
325
+ "step": 880
326
+ },
327
+ {
328
+ "epoch": 4.27,
329
+ "learning_rate": 1.8083832335329345e-05,
330
+ "loss": 0.2092,
331
+ "step": 902
332
+ },
333
+ {
334
+ "epoch": 4.38,
335
+ "learning_rate": 1.775449101796407e-05,
336
+ "loss": 0.2188,
337
+ "step": 924
338
+ },
339
+ {
340
+ "epoch": 4.48,
341
+ "learning_rate": 1.7425149700598804e-05,
342
+ "loss": 0.2092,
343
+ "step": 946
344
+ },
345
+ {
346
+ "epoch": 4.59,
347
+ "learning_rate": 1.7095808383233533e-05,
348
+ "loss": 0.2144,
349
+ "step": 968
350
+ },
351
+ {
352
+ "epoch": 4.69,
353
+ "learning_rate": 1.6766467065868263e-05,
354
+ "loss": 0.2061,
355
+ "step": 990
356
+ },
357
+ {
358
+ "epoch": 4.8,
359
+ "learning_rate": 1.6437125748502996e-05,
360
+ "loss": 0.2161,
361
+ "step": 1012
362
+ },
363
+ {
364
+ "epoch": 4.9,
365
+ "learning_rate": 1.6107784431137725e-05,
366
+ "loss": 0.2118,
367
+ "step": 1034
368
+ },
369
+ {
370
+ "epoch": 5.0,
371
+ "eval_accuracy": 0.6943889716840537,
372
+ "eval_loss": 0.45938587188720703,
373
+ "eval_runtime": 6.2541,
374
+ "eval_samples_per_second": 31.979,
375
+ "eval_steps_per_second": 2.079,
376
+ "step": 1055
377
+ },
378
+ {
379
+ "epoch": 5.0,
380
+ "eval_exact_match": 16.0,
381
+ "eval_f1": 22.22857142857142,
382
+ "eval_qa_bleu": 10.69246525438524,
383
+ "eval_qa_exact_match": 0.14,
384
+ "eval_recite_bleu": 33.64683237936858,
385
+ "eval_recite_exact_match": 0.025,
386
+ "step": 1055
387
+ },
388
+ {
389
+ "epoch": 5.0,
390
+ "learning_rate": 1.5778443113772455e-05,
391
+ "loss": 0.2129,
392
+ "step": 1056
393
+ },
394
+ {
395
+ "epoch": 5.11,
396
+ "learning_rate": 1.5449101796407184e-05,
397
+ "loss": 0.1559,
398
+ "step": 1078
399
+ },
400
+ {
401
+ "epoch": 5.21,
402
+ "learning_rate": 1.5119760479041917e-05,
403
+ "loss": 0.1631,
404
+ "step": 1100
405
+ },
406
+ {
407
+ "epoch": 5.32,
408
+ "learning_rate": 1.4790419161676647e-05,
409
+ "loss": 0.1614,
410
+ "step": 1122
411
+ },
412
+ {
413
+ "epoch": 5.42,
414
+ "learning_rate": 1.4461077844311378e-05,
415
+ "loss": 0.1636,
416
+ "step": 1144
417
+ },
418
+ {
419
+ "epoch": 5.53,
420
+ "learning_rate": 1.4131736526946109e-05,
421
+ "loss": 0.1645,
422
+ "step": 1166
423
+ },
424
+ {
425
+ "epoch": 5.63,
426
+ "learning_rate": 1.3802395209580839e-05,
427
+ "loss": 0.1613,
428
+ "step": 1188
429
+ },
430
+ {
431
+ "epoch": 5.73,
432
+ "learning_rate": 1.3473053892215568e-05,
433
+ "loss": 0.1724,
434
+ "step": 1210
435
+ },
436
+ {
437
+ "epoch": 5.84,
438
+ "learning_rate": 1.31437125748503e-05,
439
+ "loss": 0.1643,
440
+ "step": 1232
441
+ },
442
+ {
443
+ "epoch": 5.94,
444
+ "learning_rate": 1.281437125748503e-05,
445
+ "loss": 0.17,
446
+ "step": 1254
447
+ },
448
+ {
449
+ "epoch": 6.0,
450
+ "eval_accuracy": 0.6964456035767511,
451
+ "eval_loss": 0.44896385073661804,
452
+ "eval_runtime": 6.3718,
453
+ "eval_samples_per_second": 31.389,
454
+ "eval_steps_per_second": 2.04,
455
+ "step": 1266
456
+ },
457
+ {
458
+ "epoch": 6.0,
459
+ "eval_exact_match": 15.0,
460
+ "eval_f1": 21.070238095238086,
461
+ "eval_qa_bleu": 11.294234950233255,
462
+ "eval_qa_exact_match": 0.13,
463
+ "eval_recite_bleu": 35.4728355349377,
464
+ "eval_recite_exact_match": 0.025,
465
+ "step": 1266
466
+ },
467
+ {
468
+ "epoch": 6.05,
469
+ "learning_rate": 1.2485029940119762e-05,
470
+ "loss": 0.1441,
471
+ "step": 1276
472
+ },
473
+ {
474
+ "epoch": 6.15,
475
+ "learning_rate": 1.2155688622754491e-05,
476
+ "loss": 0.1356,
477
+ "step": 1298
478
+ },
479
+ {
480
+ "epoch": 6.26,
481
+ "learning_rate": 1.182634730538922e-05,
482
+ "loss": 0.1363,
483
+ "step": 1320
484
+ },
485
+ {
486
+ "epoch": 6.36,
487
+ "learning_rate": 1.1497005988023954e-05,
488
+ "loss": 0.137,
489
+ "step": 1342
490
+ },
491
+ {
492
+ "epoch": 6.46,
493
+ "learning_rate": 1.1167664670658683e-05,
494
+ "loss": 0.1382,
495
+ "step": 1364
496
+ },
497
+ {
498
+ "epoch": 6.57,
499
+ "learning_rate": 1.0838323353293413e-05,
500
+ "loss": 0.1416,
501
+ "step": 1386
502
+ },
503
+ {
504
+ "epoch": 6.67,
505
+ "learning_rate": 1.0508982035928144e-05,
506
+ "loss": 0.1389,
507
+ "step": 1408
508
+ },
509
+ {
510
+ "epoch": 6.78,
511
+ "learning_rate": 1.0179640718562873e-05,
512
+ "loss": 0.1392,
513
+ "step": 1430
514
+ },
515
+ {
516
+ "epoch": 6.88,
517
+ "learning_rate": 9.850299401197606e-06,
518
+ "loss": 0.1398,
519
+ "step": 1452
520
+ },
521
+ {
522
+ "epoch": 6.99,
523
+ "learning_rate": 9.520958083832336e-06,
524
+ "loss": 0.134,
525
+ "step": 1474
526
+ },
527
+ {
528
+ "epoch": 7.0,
529
+ "eval_accuracy": 0.697876304023845,
530
+ "eval_loss": 0.43685418367385864,
531
+ "eval_runtime": 6.2539,
532
+ "eval_samples_per_second": 31.98,
533
+ "eval_steps_per_second": 2.079,
534
+ "step": 1477
535
+ },
536
+ {
537
+ "epoch": 7.0,
538
+ "eval_exact_match": 16.5,
539
+ "eval_f1": 22.119444444444444,
540
+ "eval_qa_bleu": 13.989667861778496,
541
+ "eval_qa_exact_match": 0.14,
542
+ "eval_recite_bleu": 42.188868266934165,
543
+ "eval_recite_exact_match": 0.04,
544
+ "step": 1477
545
+ },
546
+ {
547
+ "epoch": 7.09,
548
+ "learning_rate": 9.191616766467065e-06,
549
+ "loss": 0.1188,
550
+ "step": 1496
551
+ },
552
+ {
553
+ "epoch": 7.19,
554
+ "learning_rate": 8.862275449101796e-06,
555
+ "loss": 0.1207,
556
+ "step": 1518
557
+ },
558
+ {
559
+ "epoch": 7.3,
560
+ "learning_rate": 8.532934131736528e-06,
561
+ "loss": 0.1223,
562
+ "step": 1540
563
+ },
564
+ {
565
+ "epoch": 7.4,
566
+ "learning_rate": 8.203592814371257e-06,
567
+ "loss": 0.115,
568
+ "step": 1562
569
+ },
570
+ {
571
+ "epoch": 7.51,
572
+ "learning_rate": 7.874251497005988e-06,
573
+ "loss": 0.1139,
574
+ "step": 1584
575
+ },
576
+ {
577
+ "epoch": 7.61,
578
+ "learning_rate": 7.544910179640718e-06,
579
+ "loss": 0.117,
580
+ "step": 1606
581
+ },
582
+ {
583
+ "epoch": 7.72,
584
+ "learning_rate": 7.215568862275449e-06,
585
+ "loss": 0.1203,
586
+ "step": 1628
587
+ },
588
+ {
589
+ "epoch": 7.82,
590
+ "learning_rate": 6.8862275449101795e-06,
591
+ "loss": 0.1229,
592
+ "step": 1650
593
+ },
594
+ {
595
+ "epoch": 7.92,
596
+ "learning_rate": 6.556886227544911e-06,
597
+ "loss": 0.1206,
598
+ "step": 1672
599
+ },
600
+ {
601
+ "epoch": 8.0,
602
+ "eval_accuracy": 0.6986959761549926,
603
+ "eval_loss": 0.43722403049468994,
604
+ "eval_runtime": 6.191,
605
+ "eval_samples_per_second": 32.305,
606
+ "eval_steps_per_second": 2.1,
607
+ "step": 1688
608
+ },
609
+ {
610
+ "epoch": 8.0,
611
+ "eval_exact_match": 20.0,
612
+ "eval_f1": 25.83571428571428,
613
+ "eval_qa_bleu": 8.333490045944334,
614
+ "eval_qa_exact_match": 0.18,
615
+ "eval_recite_bleu": 43.68492978075338,
616
+ "eval_recite_exact_match": 0.065,
617
+ "step": 1688
618
+ },
619
+ {
620
+ "epoch": 8.03,
621
+ "learning_rate": 6.22754491017964e-06,
622
+ "loss": 0.1145,
623
+ "step": 1694
624
+ },
625
+ {
626
+ "epoch": 8.13,
627
+ "learning_rate": 5.898203592814371e-06,
628
+ "loss": 0.1066,
629
+ "step": 1716
630
+ },
631
+ {
632
+ "epoch": 8.24,
633
+ "learning_rate": 5.568862275449102e-06,
634
+ "loss": 0.1064,
635
+ "step": 1738
636
+ },
637
+ {
638
+ "epoch": 8.34,
639
+ "learning_rate": 5.239520958083833e-06,
640
+ "loss": 0.1077,
641
+ "step": 1760
642
+ },
643
+ {
644
+ "epoch": 8.45,
645
+ "learning_rate": 4.9101796407185625e-06,
646
+ "loss": 0.1108,
647
+ "step": 1782
648
+ },
649
+ {
650
+ "epoch": 8.55,
651
+ "learning_rate": 4.580838323353294e-06,
652
+ "loss": 0.1078,
653
+ "step": 1804
654
+ },
655
+ {
656
+ "epoch": 8.65,
657
+ "learning_rate": 4.251497005988024e-06,
658
+ "loss": 0.1072,
659
+ "step": 1826
660
+ },
661
+ {
662
+ "epoch": 8.76,
663
+ "learning_rate": 3.922155688622755e-06,
664
+ "loss": 0.1098,
665
+ "step": 1848
666
+ },
667
+ {
668
+ "epoch": 8.86,
669
+ "learning_rate": 3.592814371257485e-06,
670
+ "loss": 0.1093,
671
+ "step": 1870
672
+ },
673
+ {
674
+ "epoch": 8.97,
675
+ "learning_rate": 3.2634730538922155e-06,
676
+ "loss": 0.1081,
677
+ "step": 1892
678
+ },
679
+ {
680
+ "epoch": 9.0,
681
+ "eval_accuracy": 0.698725782414307,
682
+ "eval_loss": 0.44227516651153564,
683
+ "eval_runtime": 6.2856,
684
+ "eval_samples_per_second": 31.819,
685
+ "eval_steps_per_second": 2.068,
686
+ "step": 1899
687
+ },
688
+ {
689
+ "epoch": 9.0,
690
+ "eval_exact_match": 17.5,
691
+ "eval_f1": 22.30357142857143,
692
+ "eval_qa_bleu": 10.817736383091892,
693
+ "eval_qa_exact_match": 0.15,
694
+ "eval_recite_bleu": 43.37666725316897,
695
+ "eval_recite_exact_match": 0.06,
696
+ "step": 1899
697
+ },
698
+ {
699
+ "epoch": 9.07,
700
+ "learning_rate": 2.9341317365269463e-06,
701
+ "loss": 0.1079,
702
+ "step": 1914
703
+ },
704
+ {
705
+ "epoch": 9.18,
706
+ "learning_rate": 2.6047904191616767e-06,
707
+ "loss": 0.102,
708
+ "step": 1936
709
+ },
710
+ {
711
+ "epoch": 9.28,
712
+ "learning_rate": 2.2754491017964075e-06,
713
+ "loss": 0.1003,
714
+ "step": 1958
715
+ },
716
+ {
717
+ "epoch": 9.38,
718
+ "learning_rate": 1.9461077844311374e-06,
719
+ "loss": 0.1023,
720
+ "step": 1980
721
+ },
722
+ {
723
+ "epoch": 9.49,
724
+ "learning_rate": 1.6167664670658684e-06,
725
+ "loss": 0.1003,
726
+ "step": 2002
727
+ },
728
+ {
729
+ "epoch": 9.59,
730
+ "learning_rate": 1.287425149700599e-06,
731
+ "loss": 0.1058,
732
+ "step": 2024
733
+ },
734
+ {
735
+ "epoch": 9.7,
736
+ "learning_rate": 9.580838323353293e-07,
737
+ "loss": 0.1067,
738
+ "step": 2046
739
+ },
740
+ {
741
+ "epoch": 9.8,
742
+ "learning_rate": 6.287425149700599e-07,
743
+ "loss": 0.1047,
744
+ "step": 2068
745
+ },
746
+ {
747
+ "epoch": 9.91,
748
+ "learning_rate": 2.994011976047904e-07,
749
+ "loss": 0.1053,
750
+ "step": 2090
751
+ },
752
+ {
753
+ "epoch": 10.0,
754
+ "eval_accuracy": 0.6988971684053651,
755
+ "eval_loss": 0.4435840845108032,
756
+ "eval_runtime": 6.3653,
757
+ "eval_samples_per_second": 31.42,
758
+ "eval_steps_per_second": 2.042,
759
+ "step": 2110
760
+ },
761
+ {
762
+ "epoch": 10.0,
763
+ "eval_exact_match": 20.5,
764
+ "eval_f1": 25.95833333333333,
765
+ "eval_qa_bleu": 12.082719988904218,
766
+ "eval_qa_exact_match": 0.18,
767
+ "eval_recite_bleu": 44.74289478974747,
768
+ "eval_recite_exact_match": 0.075,
769
+ "step": 2110
770
+ },
771
+ {
772
+ "epoch": 10.0,
773
+ "step": 2110,
774
+ "total_flos": 9.64020042848256e+16,
775
+ "train_loss": 0.29445283661521443,
776
+ "train_runtime": 3197.9615,
777
+ "train_samples_per_second": 10.547,
778
+ "train_steps_per_second": 0.66
779
+ }
780
+ ],
781
+ "logging_steps": 22,
782
+ "max_steps": 2110,
783
+ "num_train_epochs": 10,
784
+ "save_steps": 500,
785
+ "total_flos": 9.64020042848256e+16,
786
+ "trial_name": null,
787
+ "trial_params": null
788
+ }