tyzhu committed on
Commit
50f88e6
·
verified ·
1 Parent(s): c70e38c

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -2
  2. all_results.json +21 -0
  3. eval_results.json +16 -0
  4. train_results.json +8 -0
  5. trainer_state.json +806 -0
README.md CHANGED
@@ -3,11 +3,23 @@ license: mit
3
  base_model: gpt2-xl
4
  tags:
5
  - generated_from_trainer
 
 
6
  metrics:
7
  - accuracy
8
  model-index:
9
  - name: lmind_hotpot_train300_eval100_v1_recite_qa_gpt2-xl
10
- results: []
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -15,7 +27,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # lmind_hotpot_train300_eval100_v1_recite_qa_gpt2-xl
17
 
18
- This model is a fine-tuned version of [gpt2-xl](https://huggingface.co/gpt2-xl) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.4199
21
  - Accuracy: 0.6908
 
3
  base_model: gpt2-xl
4
  tags:
5
  - generated_from_trainer
6
+ datasets:
7
+ - tyzhu/lmind_hotpot_train300_eval100_v1_recite_qa
8
  metrics:
9
  - accuracy
10
  model-index:
11
  - name: lmind_hotpot_train300_eval100_v1_recite_qa_gpt2-xl
12
+ results:
13
+ - task:
14
+ name: Causal Language Modeling
15
+ type: text-generation
16
+ dataset:
17
+ name: tyzhu/lmind_hotpot_train300_eval100_v1_recite_qa
18
+ type: tyzhu/lmind_hotpot_train300_eval100_v1_recite_qa
19
+ metrics:
20
+ - name: Accuracy
21
+ type: accuracy
22
+ value: 0.6908442503639011
23
  ---
24
 
25
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
27
 
28
  # lmind_hotpot_train300_eval100_v1_recite_qa_gpt2-xl
29
 
30
+ This model is a fine-tuned version of [gpt2-xl](https://huggingface.co/gpt2-xl) on the tyzhu/lmind_hotpot_train300_eval100_v1_recite_qa dataset.
31
  It achieves the following results on the evaluation set:
32
  - Loss: 0.4199
33
  - Accuracy: 0.6908
all_results.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.6908442503639011,
4
+ "eval_exact_match": 19.0,
5
+ "eval_f1": 24.855555555555558,
6
+ "eval_loss": 0.4198664426803589,
7
+ "eval_qa_bleu": 14.403045967487259,
8
+ "eval_qa_exact_match": 0.18,
9
+ "eval_recite_bleu": 45.62406704093971,
10
+ "eval_recite_exact_match": 0.02,
11
+ "eval_runtime": 3.2494,
12
+ "eval_samples": 100,
13
+ "eval_samples_per_second": 30.775,
14
+ "eval_steps_per_second": 2.154,
15
+ "perplexity": 1.5217583000861028,
16
+ "train_loss": 0.7305109727641811,
17
+ "train_runtime": 1813.2807,
18
+ "train_samples": 1097,
19
+ "train_samples_per_second": 6.05,
20
+ "train_steps_per_second": 0.381
21
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.6908442503639011,
4
+ "eval_exact_match": 19.0,
5
+ "eval_f1": 24.855555555555558,
6
+ "eval_loss": 0.4198664426803589,
7
+ "eval_qa_bleu": 14.403045967487259,
8
+ "eval_qa_exact_match": 0.18,
9
+ "eval_recite_bleu": 45.62406704093971,
10
+ "eval_recite_exact_match": 0.02,
11
+ "eval_runtime": 3.2494,
12
+ "eval_samples": 100,
13
+ "eval_samples_per_second": 30.775,
14
+ "eval_steps_per_second": 2.154,
15
+ "perplexity": 1.5217583000861028
16
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "train_loss": 0.7305109727641811,
4
+ "train_runtime": 1813.2807,
5
+ "train_samples": 1097,
6
+ "train_samples_per_second": 6.05,
7
+ "train_steps_per_second": 0.381
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,806 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 690,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1,
13
+ "learning_rate": 6e-06,
14
+ "loss": 7.4689,
15
+ "step": 7
16
+ },
17
+ {
18
+ "epoch": 0.2,
19
+ "learning_rate": 1.2e-05,
20
+ "loss": 5.1854,
21
+ "step": 14
22
+ },
23
+ {
24
+ "epoch": 0.3,
25
+ "learning_rate": 1.8e-05,
26
+ "loss": 3.962,
27
+ "step": 21
28
+ },
29
+ {
30
+ "epoch": 0.41,
31
+ "learning_rate": 2.4e-05,
32
+ "loss": 2.9971,
33
+ "step": 28
34
+ },
35
+ {
36
+ "epoch": 0.51,
37
+ "learning_rate": 3e-05,
38
+ "loss": 2.342,
39
+ "step": 35
40
+ },
41
+ {
42
+ "epoch": 0.61,
43
+ "learning_rate": 2.9679389312977098e-05,
44
+ "loss": 2.2103,
45
+ "step": 42
46
+ },
47
+ {
48
+ "epoch": 0.71,
49
+ "learning_rate": 2.93587786259542e-05,
50
+ "loss": 2.1337,
51
+ "step": 49
52
+ },
53
+ {
54
+ "epoch": 0.81,
55
+ "learning_rate": 2.9038167938931298e-05,
56
+ "loss": 2.0715,
57
+ "step": 56
58
+ },
59
+ {
60
+ "epoch": 0.91,
61
+ "learning_rate": 2.8717557251908395e-05,
62
+ "loss": 1.9548,
63
+ "step": 63
64
+ },
65
+ {
66
+ "epoch": 1.0,
67
+ "eval_accuracy": 0.5812809315866084,
68
+ "eval_loss": 1.7231289148330688,
69
+ "eval_runtime": 9.0359,
70
+ "eval_samples_per_second": 11.067,
71
+ "eval_steps_per_second": 0.775,
72
+ "step": 69
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_exact_match": 6.0,
77
+ "eval_f1": 10.928571428571429,
78
+ "eval_qa_bleu": 3.6846142429687943,
79
+ "eval_qa_exact_match": 0.06,
80
+ "eval_recite_bleu": 12.518352954015292,
81
+ "eval_recite_exact_match": 0.0,
82
+ "step": 69
83
+ },
84
+ {
85
+ "epoch": 1.01,
86
+ "learning_rate": 2.8396946564885498e-05,
87
+ "loss": 1.9164,
88
+ "step": 70
89
+ },
90
+ {
91
+ "epoch": 1.12,
92
+ "learning_rate": 2.8076335877862595e-05,
93
+ "loss": 1.734,
94
+ "step": 77
95
+ },
96
+ {
97
+ "epoch": 1.22,
98
+ "learning_rate": 2.7755725190839695e-05,
99
+ "loss": 1.5936,
100
+ "step": 84
101
+ },
102
+ {
103
+ "epoch": 1.32,
104
+ "learning_rate": 2.7435114503816795e-05,
105
+ "loss": 1.4865,
106
+ "step": 91
107
+ },
108
+ {
109
+ "epoch": 1.42,
110
+ "learning_rate": 2.7114503816793892e-05,
111
+ "loss": 1.5131,
112
+ "step": 98
113
+ },
114
+ {
115
+ "epoch": 1.52,
116
+ "learning_rate": 2.6793893129770996e-05,
117
+ "loss": 1.4826,
118
+ "step": 105
119
+ },
120
+ {
121
+ "epoch": 1.62,
122
+ "learning_rate": 2.6473282442748093e-05,
123
+ "loss": 1.5076,
124
+ "step": 112
125
+ },
126
+ {
127
+ "epoch": 1.72,
128
+ "learning_rate": 2.615267175572519e-05,
129
+ "loss": 1.3919,
130
+ "step": 119
131
+ },
132
+ {
133
+ "epoch": 1.83,
134
+ "learning_rate": 2.5832061068702293e-05,
135
+ "loss": 1.458,
136
+ "step": 126
137
+ },
138
+ {
139
+ "epoch": 1.93,
140
+ "learning_rate": 2.551145038167939e-05,
141
+ "loss": 1.3306,
142
+ "step": 133
143
+ },
144
+ {
145
+ "epoch": 2.0,
146
+ "eval_accuracy": 0.6136244541484717,
147
+ "eval_loss": 1.2326337099075317,
148
+ "eval_runtime": 3.2768,
149
+ "eval_samples_per_second": 30.517,
150
+ "eval_steps_per_second": 2.136,
151
+ "step": 138
152
+ },
153
+ {
154
+ "epoch": 2.0,
155
+ "eval_exact_match": 9.0,
156
+ "eval_f1": 18.469047619047622,
157
+ "eval_qa_bleu": 10.388842490007553,
158
+ "eval_qa_exact_match": 0.08,
159
+ "eval_recite_bleu": 18.414656875876464,
160
+ "eval_recite_exact_match": 0.0,
161
+ "step": 138
162
+ },
163
+ {
164
+ "epoch": 2.03,
165
+ "learning_rate": 2.5190839694656487e-05,
166
+ "loss": 1.2798,
167
+ "step": 140
168
+ },
169
+ {
170
+ "epoch": 2.13,
171
+ "learning_rate": 2.487022900763359e-05,
172
+ "loss": 0.911,
173
+ "step": 147
174
+ },
175
+ {
176
+ "epoch": 2.23,
177
+ "learning_rate": 2.4549618320610687e-05,
178
+ "loss": 0.9711,
179
+ "step": 154
180
+ },
181
+ {
182
+ "epoch": 2.33,
183
+ "learning_rate": 2.4229007633587784e-05,
184
+ "loss": 0.9432,
185
+ "step": 161
186
+ },
187
+ {
188
+ "epoch": 2.43,
189
+ "learning_rate": 2.3908396946564887e-05,
190
+ "loss": 0.8923,
191
+ "step": 168
192
+ },
193
+ {
194
+ "epoch": 2.54,
195
+ "learning_rate": 2.3587786259541984e-05,
196
+ "loss": 0.8927,
197
+ "step": 175
198
+ },
199
+ {
200
+ "epoch": 2.64,
201
+ "learning_rate": 2.3267175572519084e-05,
202
+ "loss": 0.8711,
203
+ "step": 182
204
+ },
205
+ {
206
+ "epoch": 2.74,
207
+ "learning_rate": 2.2946564885496185e-05,
208
+ "loss": 0.8087,
209
+ "step": 189
210
+ },
211
+ {
212
+ "epoch": 2.84,
213
+ "learning_rate": 2.262595419847328e-05,
214
+ "loss": 0.8442,
215
+ "step": 196
216
+ },
217
+ {
218
+ "epoch": 2.94,
219
+ "learning_rate": 2.2305343511450385e-05,
220
+ "loss": 0.8853,
221
+ "step": 203
222
+ },
223
+ {
224
+ "epoch": 3.0,
225
+ "eval_accuracy": 0.6420524017467248,
226
+ "eval_loss": 0.8816230297088623,
227
+ "eval_runtime": 3.3365,
228
+ "eval_samples_per_second": 29.971,
229
+ "eval_steps_per_second": 2.098,
230
+ "step": 207
231
+ },
232
+ {
233
+ "epoch": 3.0,
234
+ "eval_exact_match": 9.0,
235
+ "eval_f1": 16.49920634920635,
236
+ "eval_qa_bleu": 8.802610551775999,
237
+ "eval_qa_exact_match": 0.09,
238
+ "eval_recite_bleu": 22.99064826237382,
239
+ "eval_recite_exact_match": 0.0,
240
+ "step": 207
241
+ },
242
+ {
243
+ "epoch": 3.04,
244
+ "learning_rate": 2.198473282442748e-05,
245
+ "loss": 0.6919,
246
+ "step": 210
247
+ },
248
+ {
249
+ "epoch": 3.14,
250
+ "learning_rate": 2.166412213740458e-05,
251
+ "loss": 0.5836,
252
+ "step": 217
253
+ },
254
+ {
255
+ "epoch": 3.25,
256
+ "learning_rate": 2.1343511450381682e-05,
257
+ "loss": 0.5566,
258
+ "step": 224
259
+ },
260
+ {
261
+ "epoch": 3.35,
262
+ "learning_rate": 2.102290076335878e-05,
263
+ "loss": 0.5625,
264
+ "step": 231
265
+ },
266
+ {
267
+ "epoch": 3.45,
268
+ "learning_rate": 2.0702290076335876e-05,
269
+ "loss": 0.5301,
270
+ "step": 238
271
+ },
272
+ {
273
+ "epoch": 3.55,
274
+ "learning_rate": 2.038167938931298e-05,
275
+ "loss": 0.5601,
276
+ "step": 245
277
+ },
278
+ {
279
+ "epoch": 3.65,
280
+ "learning_rate": 2.0061068702290076e-05,
281
+ "loss": 0.5088,
282
+ "step": 252
283
+ },
284
+ {
285
+ "epoch": 3.75,
286
+ "learning_rate": 1.9740458015267176e-05,
287
+ "loss": 0.5661,
288
+ "step": 259
289
+ },
290
+ {
291
+ "epoch": 3.86,
292
+ "learning_rate": 1.9419847328244276e-05,
293
+ "loss": 0.5007,
294
+ "step": 266
295
+ },
296
+ {
297
+ "epoch": 3.96,
298
+ "learning_rate": 1.9099236641221373e-05,
299
+ "loss": 0.5181,
300
+ "step": 273
301
+ },
302
+ {
303
+ "epoch": 4.0,
304
+ "eval_accuracy": 0.6637554585152838,
305
+ "eval_loss": 0.6443885564804077,
306
+ "eval_runtime": 3.1883,
307
+ "eval_samples_per_second": 31.365,
308
+ "eval_steps_per_second": 2.196,
309
+ "step": 276
310
+ },
311
+ {
312
+ "epoch": 4.0,
313
+ "eval_exact_match": 9.0,
314
+ "eval_f1": 15.252380952380953,
315
+ "eval_qa_bleu": 4.187460029928345,
316
+ "eval_qa_exact_match": 0.09,
317
+ "eval_recite_bleu": 24.09222861395294,
318
+ "eval_recite_exact_match": 0.0,
319
+ "step": 276
320
+ },
321
+ {
322
+ "epoch": 4.06,
323
+ "learning_rate": 1.8778625954198473e-05,
324
+ "loss": 0.4274,
325
+ "step": 280
326
+ },
327
+ {
328
+ "epoch": 4.16,
329
+ "learning_rate": 1.8458015267175574e-05,
330
+ "loss": 0.3626,
331
+ "step": 287
332
+ },
333
+ {
334
+ "epoch": 4.26,
335
+ "learning_rate": 1.813740458015267e-05,
336
+ "loss": 0.3519,
337
+ "step": 294
338
+ },
339
+ {
340
+ "epoch": 4.36,
341
+ "learning_rate": 1.7816793893129774e-05,
342
+ "loss": 0.3387,
343
+ "step": 301
344
+ },
345
+ {
346
+ "epoch": 4.46,
347
+ "learning_rate": 1.749618320610687e-05,
348
+ "loss": 0.331,
349
+ "step": 308
350
+ },
351
+ {
352
+ "epoch": 4.57,
353
+ "learning_rate": 1.7175572519083968e-05,
354
+ "loss": 0.3131,
355
+ "step": 315
356
+ },
357
+ {
358
+ "epoch": 4.67,
359
+ "learning_rate": 1.685496183206107e-05,
360
+ "loss": 0.3166,
361
+ "step": 322
362
+ },
363
+ {
364
+ "epoch": 4.77,
365
+ "learning_rate": 1.6534351145038168e-05,
366
+ "loss": 0.3555,
367
+ "step": 329
368
+ },
369
+ {
370
+ "epoch": 4.87,
371
+ "learning_rate": 1.6213740458015265e-05,
372
+ "loss": 0.3394,
373
+ "step": 336
374
+ },
375
+ {
376
+ "epoch": 4.97,
377
+ "learning_rate": 1.5893129770992368e-05,
378
+ "loss": 0.3236,
379
+ "step": 343
380
+ },
381
+ {
382
+ "epoch": 5.0,
383
+ "eval_accuracy": 0.6771324599708879,
384
+ "eval_loss": 0.5304832458496094,
385
+ "eval_runtime": 3.2706,
386
+ "eval_samples_per_second": 30.576,
387
+ "eval_steps_per_second": 2.14,
388
+ "step": 345
389
+ },
390
+ {
391
+ "epoch": 5.0,
392
+ "eval_exact_match": 6.0,
393
+ "eval_f1": 12.238095238095239,
394
+ "eval_qa_bleu": 3.673522350616593,
395
+ "eval_qa_exact_match": 0.06,
396
+ "eval_recite_bleu": 28.72242820390079,
397
+ "eval_recite_exact_match": 0.0,
398
+ "step": 345
399
+ },
400
+ {
401
+ "epoch": 5.07,
402
+ "learning_rate": 1.5572519083969465e-05,
403
+ "loss": 0.2619,
404
+ "step": 350
405
+ },
406
+ {
407
+ "epoch": 5.17,
408
+ "learning_rate": 1.5251908396946567e-05,
409
+ "loss": 0.2471,
410
+ "step": 357
411
+ },
412
+ {
413
+ "epoch": 5.28,
414
+ "learning_rate": 1.4931297709923665e-05,
415
+ "loss": 0.2407,
416
+ "step": 364
417
+ },
418
+ {
419
+ "epoch": 5.38,
420
+ "learning_rate": 1.4610687022900764e-05,
421
+ "loss": 0.2503,
422
+ "step": 371
423
+ },
424
+ {
425
+ "epoch": 5.48,
426
+ "learning_rate": 1.4290076335877862e-05,
427
+ "loss": 0.225,
428
+ "step": 378
429
+ },
430
+ {
431
+ "epoch": 5.58,
432
+ "learning_rate": 1.3969465648854963e-05,
433
+ "loss": 0.2486,
434
+ "step": 385
435
+ },
436
+ {
437
+ "epoch": 5.68,
438
+ "learning_rate": 1.3648854961832061e-05,
439
+ "loss": 0.2366,
440
+ "step": 392
441
+ },
442
+ {
443
+ "epoch": 5.78,
444
+ "learning_rate": 1.332824427480916e-05,
445
+ "loss": 0.2371,
446
+ "step": 399
447
+ },
448
+ {
449
+ "epoch": 5.88,
450
+ "learning_rate": 1.300763358778626e-05,
451
+ "loss": 0.2308,
452
+ "step": 406
453
+ },
454
+ {
455
+ "epoch": 5.99,
456
+ "learning_rate": 1.268702290076336e-05,
457
+ "loss": 0.2371,
458
+ "step": 413
459
+ },
460
+ {
461
+ "epoch": 6.0,
462
+ "eval_accuracy": 0.6848471615720524,
463
+ "eval_loss": 0.4593181312084198,
464
+ "eval_runtime": 3.3218,
465
+ "eval_samples_per_second": 30.104,
466
+ "eval_steps_per_second": 2.107,
467
+ "step": 414
468
+ },
469
+ {
470
+ "epoch": 6.0,
471
+ "eval_exact_match": 12.0,
472
+ "eval_f1": 19.94920634920635,
473
+ "eval_qa_bleu": 7.958666643852284,
474
+ "eval_qa_exact_match": 0.12,
475
+ "eval_recite_bleu": 36.347059921630894,
476
+ "eval_recite_exact_match": 0.0,
477
+ "step": 414
478
+ },
479
+ {
480
+ "epoch": 6.09,
481
+ "learning_rate": 1.2366412213740458e-05,
482
+ "loss": 0.1793,
483
+ "step": 420
484
+ },
485
+ {
486
+ "epoch": 6.19,
487
+ "learning_rate": 1.2045801526717557e-05,
488
+ "loss": 0.179,
489
+ "step": 427
490
+ },
491
+ {
492
+ "epoch": 6.29,
493
+ "learning_rate": 1.1725190839694657e-05,
494
+ "loss": 0.1729,
495
+ "step": 434
496
+ },
497
+ {
498
+ "epoch": 6.39,
499
+ "learning_rate": 1.1404580152671756e-05,
500
+ "loss": 0.1748,
501
+ "step": 441
502
+ },
503
+ {
504
+ "epoch": 6.49,
505
+ "learning_rate": 1.1083969465648856e-05,
506
+ "loss": 0.1847,
507
+ "step": 448
508
+ },
509
+ {
510
+ "epoch": 6.59,
511
+ "learning_rate": 1.0763358778625954e-05,
512
+ "loss": 0.1823,
513
+ "step": 455
514
+ },
515
+ {
516
+ "epoch": 6.7,
517
+ "learning_rate": 1.0442748091603054e-05,
518
+ "loss": 0.1825,
519
+ "step": 462
520
+ },
521
+ {
522
+ "epoch": 6.8,
523
+ "learning_rate": 1.0122137404580153e-05,
524
+ "loss": 0.1651,
525
+ "step": 469
526
+ },
527
+ {
528
+ "epoch": 6.9,
529
+ "learning_rate": 9.801526717557251e-06,
530
+ "loss": 0.1675,
531
+ "step": 476
532
+ },
533
+ {
534
+ "epoch": 7.0,
535
+ "learning_rate": 9.480916030534352e-06,
536
+ "loss": 0.1839,
537
+ "step": 483
538
+ },
539
+ {
540
+ "epoch": 7.0,
541
+ "eval_accuracy": 0.6880931586608442,
542
+ "eval_loss": 0.43847352266311646,
543
+ "eval_runtime": 4.5343,
544
+ "eval_samples_per_second": 22.054,
545
+ "eval_steps_per_second": 1.544,
546
+ "step": 483
547
+ },
548
+ {
549
+ "epoch": 7.0,
550
+ "eval_exact_match": 10.0,
551
+ "eval_f1": 16.56111111111111,
552
+ "eval_qa_bleu": 5.445893929820159,
553
+ "eval_qa_exact_match": 0.1,
554
+ "eval_recite_bleu": 38.055647893528544,
555
+ "eval_recite_exact_match": 0.0,
556
+ "step": 483
557
+ },
558
+ {
559
+ "epoch": 7.1,
560
+ "learning_rate": 9.16030534351145e-06,
561
+ "loss": 0.1371,
562
+ "step": 490
563
+ },
564
+ {
565
+ "epoch": 7.2,
566
+ "learning_rate": 8.83969465648855e-06,
567
+ "loss": 0.1461,
568
+ "step": 497
569
+ },
570
+ {
571
+ "epoch": 7.3,
572
+ "learning_rate": 8.519083969465649e-06,
573
+ "loss": 0.1343,
574
+ "step": 504
575
+ },
576
+ {
577
+ "epoch": 7.41,
578
+ "learning_rate": 8.198473282442749e-06,
579
+ "loss": 0.1452,
580
+ "step": 511
581
+ },
582
+ {
583
+ "epoch": 7.51,
584
+ "learning_rate": 7.877862595419847e-06,
585
+ "loss": 0.1415,
586
+ "step": 518
587
+ },
588
+ {
589
+ "epoch": 7.61,
590
+ "learning_rate": 7.557251908396946e-06,
591
+ "loss": 0.162,
592
+ "step": 525
593
+ },
594
+ {
595
+ "epoch": 7.71,
596
+ "learning_rate": 7.236641221374045e-06,
597
+ "loss": 0.1468,
598
+ "step": 532
599
+ },
600
+ {
601
+ "epoch": 7.81,
602
+ "learning_rate": 6.9160305343511455e-06,
603
+ "loss": 0.14,
604
+ "step": 539
605
+ },
606
+ {
607
+ "epoch": 7.91,
608
+ "learning_rate": 6.595419847328244e-06,
609
+ "loss": 0.1287,
610
+ "step": 546
611
+ },
612
+ {
613
+ "epoch": 8.0,
614
+ "eval_accuracy": 0.6898835516739447,
615
+ "eval_loss": 0.42429620027542114,
616
+ "eval_runtime": 3.4153,
617
+ "eval_samples_per_second": 29.28,
618
+ "eval_steps_per_second": 2.05,
619
+ "step": 552
620
+ },
621
+ {
622
+ "epoch": 8.0,
623
+ "eval_exact_match": 17.0,
624
+ "eval_f1": 21.971428571428575,
625
+ "eval_qa_bleu": 22.723779810738534,
626
+ "eval_qa_exact_match": 0.16,
627
+ "eval_recite_bleu": 42.69152358155121,
628
+ "eval_recite_exact_match": 0.01,
629
+ "step": 552
630
+ },
631
+ {
632
+ "epoch": 8.01,
633
+ "learning_rate": 6.274809160305344e-06,
634
+ "loss": 0.1479,
635
+ "step": 553
636
+ },
637
+ {
638
+ "epoch": 8.12,
639
+ "learning_rate": 5.954198473282443e-06,
640
+ "loss": 0.1235,
641
+ "step": 560
642
+ },
643
+ {
644
+ "epoch": 8.22,
645
+ "learning_rate": 5.633587786259543e-06,
646
+ "loss": 0.1341,
647
+ "step": 567
648
+ },
649
+ {
650
+ "epoch": 8.32,
651
+ "learning_rate": 5.312977099236641e-06,
652
+ "loss": 0.1178,
653
+ "step": 574
654
+ },
655
+ {
656
+ "epoch": 8.42,
657
+ "learning_rate": 4.99236641221374e-06,
658
+ "loss": 0.1172,
659
+ "step": 581
660
+ },
661
+ {
662
+ "epoch": 8.52,
663
+ "learning_rate": 4.67175572519084e-06,
664
+ "loss": 0.1313,
665
+ "step": 588
666
+ },
667
+ {
668
+ "epoch": 8.62,
669
+ "learning_rate": 4.3511450381679385e-06,
670
+ "loss": 0.1173,
671
+ "step": 595
672
+ },
673
+ {
674
+ "epoch": 8.72,
675
+ "learning_rate": 4.030534351145039e-06,
676
+ "loss": 0.1148,
677
+ "step": 602
678
+ },
679
+ {
680
+ "epoch": 8.83,
681
+ "learning_rate": 3.709923664122137e-06,
682
+ "loss": 0.1208,
683
+ "step": 609
684
+ },
685
+ {
686
+ "epoch": 8.93,
687
+ "learning_rate": 3.3893129770992365e-06,
688
+ "loss": 0.1241,
689
+ "step": 616
690
+ },
691
+ {
692
+ "epoch": 9.0,
693
+ "eval_accuracy": 0.6904949053857351,
694
+ "eval_loss": 0.4206622242927551,
695
+ "eval_runtime": 3.3293,
696
+ "eval_samples_per_second": 30.036,
697
+ "eval_steps_per_second": 2.103,
698
+ "step": 621
699
+ },
700
+ {
701
+ "epoch": 9.0,
702
+ "eval_exact_match": 14.0,
703
+ "eval_f1": 20.504761904761907,
704
+ "eval_qa_bleu": 13.525989143509026,
705
+ "eval_qa_exact_match": 0.14,
706
+ "eval_recite_bleu": 45.9618562736209,
707
+ "eval_recite_exact_match": 0.02,
708
+ "step": 621
709
+ },
710
+ {
711
+ "epoch": 9.03,
712
+ "learning_rate": 3.068702290076336e-06,
713
+ "loss": 0.122,
714
+ "step": 623
715
+ },
716
+ {
717
+ "epoch": 9.13,
718
+ "learning_rate": 2.748091603053435e-06,
719
+ "loss": 0.1121,
720
+ "step": 630
721
+ },
722
+ {
723
+ "epoch": 9.23,
724
+ "learning_rate": 2.4274809160305345e-06,
725
+ "loss": 0.1127,
726
+ "step": 637
727
+ },
728
+ {
729
+ "epoch": 9.33,
730
+ "learning_rate": 2.106870229007634e-06,
731
+ "loss": 0.1148,
732
+ "step": 644
733
+ },
734
+ {
735
+ "epoch": 9.43,
736
+ "learning_rate": 1.7862595419847328e-06,
737
+ "loss": 0.1123,
738
+ "step": 651
739
+ },
740
+ {
741
+ "epoch": 9.54,
742
+ "learning_rate": 1.4656488549618321e-06,
743
+ "loss": 0.1094,
744
+ "step": 658
745
+ },
746
+ {
747
+ "epoch": 9.64,
748
+ "learning_rate": 1.1450381679389313e-06,
749
+ "loss": 0.117,
750
+ "step": 665
751
+ },
752
+ {
753
+ "epoch": 9.74,
754
+ "learning_rate": 8.244274809160305e-07,
755
+ "loss": 0.118,
756
+ "step": 672
757
+ },
758
+ {
759
+ "epoch": 9.84,
760
+ "learning_rate": 5.038167938931298e-07,
761
+ "loss": 0.1097,
762
+ "step": 679
763
+ },
764
+ {
765
+ "epoch": 9.94,
766
+ "learning_rate": 1.8320610687022902e-07,
767
+ "loss": 0.1198,
768
+ "step": 686
769
+ },
770
+ {
771
+ "epoch": 10.0,
772
+ "eval_accuracy": 0.6908442503639011,
773
+ "eval_loss": 0.4198664426803589,
774
+ "eval_runtime": 3.7502,
775
+ "eval_samples_per_second": 26.666,
776
+ "eval_steps_per_second": 1.867,
777
+ "step": 690
778
+ },
779
+ {
780
+ "epoch": 10.0,
781
+ "eval_exact_match": 19.0,
782
+ "eval_f1": 24.855555555555558,
783
+ "eval_qa_bleu": 14.403045967487259,
784
+ "eval_qa_exact_match": 0.18,
785
+ "eval_recite_bleu": 45.62406704093971,
786
+ "eval_recite_exact_match": 0.02,
787
+ "step": 690
788
+ },
789
+ {
790
+ "epoch": 10.0,
791
+ "step": 690,
792
+ "total_flos": 3.18562653216768e+16,
793
+ "train_loss": 0.7305109727641811,
794
+ "train_runtime": 1813.2807,
795
+ "train_samples_per_second": 6.05,
796
+ "train_steps_per_second": 0.381
797
+ }
798
+ ],
799
+ "logging_steps": 7,
800
+ "max_steps": 690,
801
+ "num_train_epochs": 10,
802
+ "save_steps": 500,
803
+ "total_flos": 3.18562653216768e+16,
804
+ "trial_name": null,
805
+ "trial_params": null
806
+ }