sandernotenbaert commited on
Commit
04958f7
·
verified ·
1 Parent(s): 3cdb050

Training in progress, step 1300, checkpoint

Browse files
last-checkpoint/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MistralForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 1,
7
+ "eos_token_id": 2,
8
+ "head_dim": null,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 256,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 512,
13
+ "max_position_embeddings": 1024,
14
+ "model_type": "mistral",
15
+ "num_attention_heads": 4,
16
+ "num_hidden_layers": 2,
17
+ "num_key_value_heads": 4,
18
+ "pad_token_id": 0,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_theta": 10000.0,
21
+ "sliding_window": 256,
22
+ "tie_word_embeddings": false,
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.52.3",
25
+ "use_cache": true,
26
+ "vocab_size": 30000
27
+ }
last-checkpoint/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.52.3"
7
+ }
last-checkpoint/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eeb256257e095023202df2821297a8d339ad1834d968399938dd0c2e0d61ddb0
3
+ size 66690264
last-checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ef34edc57783b5d2ff75e949910673211e6c4ca385376406335fa83c91745cd
3
+ size 133393631
last-checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4569729d12b4ffcd3c0004d02f4f36833455efd52f16ae44ecb408092e8532c9
3
+ size 13990
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e617b832469a9a84668626e0822faae0a42147d51b13c5698eb75b59b231f4f
3
+ size 1064
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 9.911575317382812,
4
+ "best_model_checkpoint": "./models/v-001/checkpoint-1000",
5
+ "epoch": 41.97826086956522,
6
+ "eval_steps": 100,
7
+ "global_step": 1300,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.6521739130434783,
14
+ "grad_norm": 0.5281125903129578,
15
+ "learning_rate": 3.166666666666667e-06,
16
+ "loss": 10.3483,
17
+ "step": 20
18
+ },
19
+ {
20
+ "epoch": 1.2934782608695652,
21
+ "grad_norm": 0.6133605241775513,
22
+ "learning_rate": 6.5000000000000004e-06,
23
+ "loss": 10.3417,
24
+ "step": 40
25
+ },
26
+ {
27
+ "epoch": 1.9456521739130435,
28
+ "grad_norm": 0.6125457882881165,
29
+ "learning_rate": 9.833333333333333e-06,
30
+ "loss": 10.3299,
31
+ "step": 60
32
+ },
33
+ {
34
+ "epoch": 2.5869565217391304,
35
+ "grad_norm": 0.5962333679199219,
36
+ "learning_rate": 1.3166666666666665e-05,
37
+ "loss": 10.3064,
38
+ "step": 80
39
+ },
40
+ {
41
+ "epoch": 3.2282608695652173,
42
+ "grad_norm": 0.6132860779762268,
43
+ "learning_rate": 1.65e-05,
44
+ "loss": 10.2727,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 3.2282608695652173,
49
+ "eval_accuracy": 4.39651795777744e-05,
50
+ "eval_loss": 10.328398704528809,
51
+ "eval_runtime": 17.8533,
52
+ "eval_samples_per_second": 16.916,
53
+ "eval_steps_per_second": 1.064,
54
+ "step": 100
55
+ },
56
+ {
57
+ "epoch": 3.880434782608696,
58
+ "grad_norm": 0.5671436190605164,
59
+ "learning_rate": 1.9833333333333335e-05,
60
+ "loss": 10.2083,
61
+ "step": 120
62
+ },
63
+ {
64
+ "epoch": 4.521739130434782,
65
+ "grad_norm": 0.4685352146625519,
66
+ "learning_rate": 2.3166666666666666e-05,
67
+ "loss": 10.1064,
68
+ "step": 140
69
+ },
70
+ {
71
+ "epoch": 5.163043478260869,
72
+ "grad_norm": 0.45952484011650085,
73
+ "learning_rate": 2.6500000000000004e-05,
74
+ "loss": 9.9909,
75
+ "step": 160
76
+ },
77
+ {
78
+ "epoch": 5.815217391304348,
79
+ "grad_norm": 0.41472023725509644,
80
+ "learning_rate": 2.9833333333333335e-05,
81
+ "loss": 9.8718,
82
+ "step": 180
83
+ },
84
+ {
85
+ "epoch": 6.456521739130435,
86
+ "grad_norm": 0.43127089738845825,
87
+ "learning_rate": 3.316666666666667e-05,
88
+ "loss": 9.7582,
89
+ "step": 200
90
+ },
91
+ {
92
+ "epoch": 6.456521739130435,
93
+ "eval_accuracy": 0.0026139297676240417,
94
+ "eval_loss": 10.096575736999512,
95
+ "eval_runtime": 35.2536,
96
+ "eval_samples_per_second": 8.566,
97
+ "eval_steps_per_second": 0.539,
98
+ "step": 200
99
+ },
100
+ {
101
+ "epoch": 7.0978260869565215,
102
+ "grad_norm": 0.4042549431324005,
103
+ "learning_rate": 3.65e-05,
104
+ "loss": 9.6266,
105
+ "step": 220
106
+ },
107
+ {
108
+ "epoch": 7.75,
109
+ "grad_norm": 0.3853429853916168,
110
+ "learning_rate": 3.983333333333333e-05,
111
+ "loss": 9.5189,
112
+ "step": 240
113
+ },
114
+ {
115
+ "epoch": 8.391304347826088,
116
+ "grad_norm": 0.38628196716308594,
117
+ "learning_rate": 4.316666666666667e-05,
118
+ "loss": 9.4044,
119
+ "step": 260
120
+ },
121
+ {
122
+ "epoch": 9.032608695652174,
123
+ "grad_norm": 0.43260782957077026,
124
+ "learning_rate": 4.6500000000000005e-05,
125
+ "loss": 9.3052,
126
+ "step": 280
127
+ },
128
+ {
129
+ "epoch": 9.684782608695652,
130
+ "grad_norm": 0.4363991916179657,
131
+ "learning_rate": 4.9833333333333336e-05,
132
+ "loss": 9.2052,
133
+ "step": 300
134
+ },
135
+ {
136
+ "epoch": 9.684782608695652,
137
+ "eval_accuracy": 0.003677087746504768,
138
+ "eval_loss": 9.951318740844727,
139
+ "eval_runtime": 47.0129,
140
+ "eval_samples_per_second": 6.424,
141
+ "eval_steps_per_second": 0.404,
142
+ "step": 300
143
+ },
144
+ {
145
+ "epoch": 10.326086956521738,
146
+ "grad_norm": 0.49219855666160583,
147
+ "learning_rate": 5.316666666666667e-05,
148
+ "loss": 9.1327,
149
+ "step": 320
150
+ },
151
+ {
152
+ "epoch": 10.978260869565217,
153
+ "grad_norm": 0.4471158981323242,
154
+ "learning_rate": 5.65e-05,
155
+ "loss": 9.0328,
156
+ "step": 340
157
+ },
158
+ {
159
+ "epoch": 11.619565217391305,
160
+ "grad_norm": 0.3913232684135437,
161
+ "learning_rate": 5.983333333333334e-05,
162
+ "loss": 8.9514,
163
+ "step": 360
164
+ },
165
+ {
166
+ "epoch": 12.26086956521739,
167
+ "grad_norm": 1.0882291793823242,
168
+ "learning_rate": 6.316666666666668e-05,
169
+ "loss": 8.8916,
170
+ "step": 380
171
+ },
172
+ {
173
+ "epoch": 12.91304347826087,
174
+ "grad_norm": 0.6010486483573914,
175
+ "learning_rate": 6.65e-05,
176
+ "loss": 8.8216,
177
+ "step": 400
178
+ },
179
+ {
180
+ "epoch": 12.91304347826087,
181
+ "eval_accuracy": 0.003413296669038122,
182
+ "eval_loss": 9.953831672668457,
183
+ "eval_runtime": 36.8026,
184
+ "eval_samples_per_second": 8.206,
185
+ "eval_steps_per_second": 0.516,
186
+ "step": 400
187
+ },
188
+ {
189
+ "epoch": 13.554347826086957,
190
+ "grad_norm": 0.49556687474250793,
191
+ "learning_rate": 6.983333333333334e-05,
192
+ "loss": 8.7406,
193
+ "step": 420
194
+ },
195
+ {
196
+ "epoch": 14.195652173913043,
197
+ "grad_norm": 0.495381623506546,
198
+ "learning_rate": 7.316666666666668e-05,
199
+ "loss": 8.679,
200
+ "step": 440
201
+ },
202
+ {
203
+ "epoch": 14.847826086956522,
204
+ "grad_norm": 0.6165482401847839,
205
+ "learning_rate": 7.65e-05,
206
+ "loss": 8.5904,
207
+ "step": 460
208
+ },
209
+ {
210
+ "epoch": 15.48913043478261,
211
+ "grad_norm": 0.5654007792472839,
212
+ "learning_rate": 7.983333333333334e-05,
213
+ "loss": 8.4896,
214
+ "step": 480
215
+ },
216
+ {
217
+ "epoch": 16.130434782608695,
218
+ "grad_norm": 0.6611935496330261,
219
+ "learning_rate": 8.316666666666666e-05,
220
+ "loss": 8.406,
221
+ "step": 500
222
+ },
223
+ {
224
+ "epoch": 16.130434782608695,
225
+ "eval_accuracy": 0.0029496638662179554,
226
+ "eval_loss": 9.952414512634277,
227
+ "eval_runtime": 39.649,
228
+ "eval_samples_per_second": 7.617,
229
+ "eval_steps_per_second": 0.479,
230
+ "step": 500
231
+ },
232
+ {
233
+ "epoch": 16.782608695652176,
234
+ "grad_norm": 0.7537912726402283,
235
+ "learning_rate": 8.65e-05,
236
+ "loss": 8.2948,
237
+ "step": 520
238
+ },
239
+ {
240
+ "epoch": 17.42391304347826,
241
+ "grad_norm": 0.9145230650901794,
242
+ "learning_rate": 8.983333333333334e-05,
243
+ "loss": 8.1992,
244
+ "step": 540
245
+ },
246
+ {
247
+ "epoch": 18.065217391304348,
248
+ "grad_norm": 0.810655415058136,
249
+ "learning_rate": 9.316666666666666e-05,
250
+ "loss": 8.08,
251
+ "step": 560
252
+ },
253
+ {
254
+ "epoch": 18.717391304347824,
255
+ "grad_norm": 0.9121057987213135,
256
+ "learning_rate": 9.65e-05,
257
+ "loss": 7.9438,
258
+ "step": 580
259
+ },
260
+ {
261
+ "epoch": 19.358695652173914,
262
+ "grad_norm": 0.8612993359565735,
263
+ "learning_rate": 9.983333333333334e-05,
264
+ "loss": 7.8326,
265
+ "step": 600
266
+ },
267
+ {
268
+ "epoch": 19.358695652173914,
269
+ "eval_accuracy": 0.0021423032957897346,
270
+ "eval_loss": 9.945837020874023,
271
+ "eval_runtime": 25.9715,
272
+ "eval_samples_per_second": 11.628,
273
+ "eval_steps_per_second": 0.732,
274
+ "step": 600
275
+ },
276
+ {
277
+ "epoch": 20.0,
278
+ "grad_norm": 0.8960981369018555,
279
+ "learning_rate": 9.995456138403733e-05,
280
+ "loss": 7.716,
281
+ "step": 620
282
+ },
283
+ {
284
+ "epoch": 20.652173913043477,
285
+ "grad_norm": 0.9986662268638611,
286
+ "learning_rate": 9.980864681729001e-05,
287
+ "loss": 7.5692,
288
+ "step": 640
289
+ },
290
+ {
291
+ "epoch": 21.293478260869566,
292
+ "grad_norm": 0.9232766628265381,
293
+ "learning_rate": 9.956242426451834e-05,
294
+ "loss": 7.4208,
295
+ "step": 660
296
+ },
297
+ {
298
+ "epoch": 21.945652173913043,
299
+ "grad_norm": 0.9031963348388672,
300
+ "learning_rate": 9.921638958517565e-05,
301
+ "loss": 7.3481,
302
+ "step": 680
303
+ },
304
+ {
305
+ "epoch": 22.58695652173913,
306
+ "grad_norm": 0.9567400813102722,
307
+ "learning_rate": 9.877123964705497e-05,
308
+ "loss": 7.1956,
309
+ "step": 700
310
+ },
311
+ {
312
+ "epoch": 22.58695652173913,
313
+ "eval_accuracy": 0.001666679989448357,
314
+ "eval_loss": 9.986405372619629,
315
+ "eval_runtime": 57.7614,
316
+ "eval_samples_per_second": 5.228,
317
+ "eval_steps_per_second": 0.329,
318
+ "step": 700
319
+ },
320
+ {
321
+ "epoch": 23.22826086956522,
322
+ "grad_norm": 1.135650873184204,
323
+ "learning_rate": 9.822787092288991e-05,
324
+ "loss": 7.0604,
325
+ "step": 720
326
+ },
327
+ {
328
+ "epoch": 23.880434782608695,
329
+ "grad_norm": 0.8771520853042603,
330
+ "learning_rate": 9.758737768497802e-05,
331
+ "loss": 6.9215,
332
+ "step": 740
333
+ },
334
+ {
335
+ "epoch": 24.52173913043478,
336
+ "grad_norm": 1.0156564712524414,
337
+ "learning_rate": 9.685104980146193e-05,
338
+ "loss": 6.8363,
339
+ "step": 760
340
+ },
341
+ {
342
+ "epoch": 25.16304347826087,
343
+ "grad_norm": 0.9963734149932861,
344
+ "learning_rate": 9.60203701387066e-05,
345
+ "loss": 6.6577,
346
+ "step": 780
347
+ },
348
+ {
349
+ "epoch": 25.815217391304348,
350
+ "grad_norm": 0.8723818063735962,
351
+ "learning_rate": 9.509701157500376e-05,
352
+ "loss": 6.5659,
353
+ "step": 800
354
+ },
355
+ {
356
+ "epoch": 25.815217391304348,
357
+ "eval_accuracy": 0.0014988129401514,
358
+ "eval_loss": 9.925810813903809,
359
+ "eval_runtime": 49.4048,
360
+ "eval_samples_per_second": 6.113,
361
+ "eval_steps_per_second": 0.385,
362
+ "step": 800
363
+ },
364
+ {
365
+ "epoch": 26.456521739130434,
366
+ "grad_norm": 1.1702412366867065,
367
+ "learning_rate": 9.408283363161774e-05,
368
+ "loss": 6.393,
369
+ "step": 820
370
+ },
371
+ {
372
+ "epoch": 27.097826086956523,
373
+ "grad_norm": 0.8747526407241821,
374
+ "learning_rate": 9.297987872795705e-05,
375
+ "loss": 6.3074,
376
+ "step": 840
377
+ },
378
+ {
379
+ "epoch": 27.75,
380
+ "grad_norm": 0.973866879940033,
381
+ "learning_rate": 9.179036806841353e-05,
382
+ "loss": 6.1801,
383
+ "step": 860
384
+ },
385
+ {
386
+ "epoch": 28.391304347826086,
387
+ "grad_norm": 1.585481882095337,
388
+ "learning_rate": 9.051669716915227e-05,
389
+ "loss": 6.1175,
390
+ "step": 880
391
+ },
392
+ {
393
+ "epoch": 29.032608695652176,
394
+ "grad_norm": 1.0919766426086426,
395
+ "learning_rate": 8.916143103386093e-05,
396
+ "loss": 5.9719,
397
+ "step": 900
398
+ },
399
+ {
400
+ "epoch": 29.032608695652176,
401
+ "eval_accuracy": 0.001458844595080696,
402
+ "eval_loss": 9.97097396850586,
403
+ "eval_runtime": 46.643,
404
+ "eval_samples_per_second": 6.475,
405
+ "eval_steps_per_second": 0.407,
406
+ "step": 900
407
+ },
408
+ {
409
+ "epoch": 29.684782608695652,
410
+ "grad_norm": 1.173614501953125,
411
+ "learning_rate": 8.77272989881736e-05,
412
+ "loss": 5.8702,
413
+ "step": 920
414
+ },
415
+ {
416
+ "epoch": 30.32608695652174,
417
+ "grad_norm": 1.0350476503372192,
418
+ "learning_rate": 8.621718918317225e-05,
419
+ "loss": 5.704,
420
+ "step": 940
421
+ },
422
+ {
423
+ "epoch": 30.97826086956522,
424
+ "grad_norm": 1.1128321886062622,
425
+ "learning_rate": 8.463414277903475e-05,
426
+ "loss": 5.6413,
427
+ "step": 960
428
+ },
429
+ {
430
+ "epoch": 31.619565217391305,
431
+ "grad_norm": 1.2460695505142212,
432
+ "learning_rate": 8.298134782054305e-05,
433
+ "loss": 5.4948,
434
+ "step": 980
435
+ },
436
+ {
437
+ "epoch": 32.26086956521739,
438
+ "grad_norm": 1.1606298685073853,
439
+ "learning_rate": 8.126213281678526e-05,
440
+ "loss": 5.4031,
441
+ "step": 1000
442
+ },
443
+ {
444
+ "epoch": 32.26086956521739,
445
+ "eval_accuracy": 0.001079145316909008,
446
+ "eval_loss": 9.911575317382812,
447
+ "eval_runtime": 48.65,
448
+ "eval_samples_per_second": 6.208,
449
+ "eval_steps_per_second": 0.391,
450
+ "step": 1000
451
+ },
452
+ {
453
+ "epoch": 32.91304347826087,
454
+ "grad_norm": 1.0351324081420898,
455
+ "learning_rate": 7.94799600379813e-05,
456
+ "loss": 5.3272,
457
+ "step": 1020
458
+ },
459
+ {
460
+ "epoch": 33.55434782608695,
461
+ "grad_norm": 1.089340329170227,
462
+ "learning_rate": 7.763841854293145e-05,
463
+ "loss": 5.2996,
464
+ "step": 1040
465
+ },
466
+ {
467
+ "epoch": 34.19565217391305,
468
+ "grad_norm": 1.3611856698989868,
469
+ "learning_rate": 7.574121695112954e-05,
470
+ "loss": 5.1266,
471
+ "step": 1060
472
+ },
473
+ {
474
+ "epoch": 34.84782608695652,
475
+ "grad_norm": 1.2501380443572998,
476
+ "learning_rate": 7.379217597409688e-05,
477
+ "loss": 5.0434,
478
+ "step": 1080
479
+ },
480
+ {
481
+ "epoch": 35.48913043478261,
482
+ "grad_norm": 1.057522177696228,
483
+ "learning_rate": 7.179522072097774e-05,
484
+ "loss": 4.9784,
485
+ "step": 1100
486
+ },
487
+ {
488
+ "epoch": 35.48913043478261,
489
+ "eval_accuracy": 0.0011830630140928385,
490
+ "eval_loss": 9.981914520263672,
491
+ "eval_runtime": 106.635,
492
+ "eval_samples_per_second": 2.832,
493
+ "eval_steps_per_second": 0.178,
494
+ "step": 1100
495
+ },
496
+ {
497
+ "epoch": 36.130434782608695,
498
+ "grad_norm": 1.1768474578857422,
499
+ "learning_rate": 6.975437279389181e-05,
500
+ "loss": 4.9012,
501
+ "step": 1120
502
+ },
503
+ {
504
+ "epoch": 36.78260869565217,
505
+ "grad_norm": 1.1783802509307861,
506
+ "learning_rate": 6.767374218896286e-05,
507
+ "loss": 4.823,
508
+ "step": 1140
509
+ },
510
+ {
511
+ "epoch": 37.42391304347826,
512
+ "grad_norm": 1.220082402229309,
513
+ "learning_rate": 6.555751901933342e-05,
514
+ "loss": 4.7149,
515
+ "step": 1160
516
+ },
517
+ {
518
+ "epoch": 38.06521739130435,
519
+ "grad_norm": 1.3078495264053345,
520
+ "learning_rate": 6.340996507683458e-05,
521
+ "loss": 4.6413,
522
+ "step": 1180
523
+ },
524
+ {
525
+ "epoch": 38.71739130434783,
526
+ "grad_norm": 1.2146966457366943,
527
+ "learning_rate": 6.123540524930442e-05,
528
+ "loss": 4.6684,
529
+ "step": 1200
530
+ },
531
+ {
532
+ "epoch": 38.71739130434783,
533
+ "eval_accuracy": 0.0008952909295837696,
534
+ "eval_loss": 10.014237403869629,
535
+ "eval_runtime": 31.4866,
536
+ "eval_samples_per_second": 9.591,
537
+ "eval_steps_per_second": 0.603,
538
+ "step": 1200
539
+ },
540
+ {
541
+ "epoch": 39.391304347826086,
542
+ "grad_norm": 1.090649962425232,
543
+ "learning_rate": 5.903821881083942e-05,
544
+ "loss": 4.7794,
545
+ "step": 1220
546
+ },
547
+ {
548
+ "epoch": 40.03260869565217,
549
+ "grad_norm": 1.2051453590393066,
550
+ "learning_rate": 5.682283060251932e-05,
551
+ "loss": 4.4631,
552
+ "step": 1240
553
+ },
554
+ {
555
+ "epoch": 40.68478260869565,
556
+ "grad_norm": 1.0512608289718628,
557
+ "learning_rate": 5.4593702121365955e-05,
558
+ "loss": 4.4119,
559
+ "step": 1260
560
+ },
561
+ {
562
+ "epoch": 41.32608695652174,
563
+ "grad_norm": 0.9912136793136597,
564
+ "learning_rate": 5.235532253548213e-05,
565
+ "loss": 4.3377,
566
+ "step": 1280
567
+ },
568
+ {
569
+ "epoch": 41.97826086956522,
570
+ "grad_norm": 1.0219991207122803,
571
+ "learning_rate": 5.0112199643464376e-05,
572
+ "loss": 4.3184,
573
+ "step": 1300
574
+ },
575
+ {
576
+ "epoch": 41.97826086956522,
577
+ "eval_accuracy": 0.001019192799302952,
578
+ "eval_loss": 10.048251152038574,
579
+ "eval_runtime": 18.0166,
580
+ "eval_samples_per_second": 16.762,
581
+ "eval_steps_per_second": 1.055,
582
+ "step": 1300
583
+ }
584
+ ],
585
+ "logging_steps": 20,
586
+ "max_steps": 2000,
587
+ "num_input_tokens_seen": 0,
588
+ "num_train_epochs": 65,
589
+ "save_steps": 100,
590
+ "stateful_callbacks": {
591
+ "TrainerControl": {
592
+ "args": {
593
+ "should_epoch_stop": false,
594
+ "should_evaluate": false,
595
+ "should_log": false,
596
+ "should_save": true,
597
+ "should_training_stop": false
598
+ },
599
+ "attributes": {}
600
+ }
601
+ },
602
+ "total_flos": 1697662857792000.0,
603
+ "train_batch_size": 8,
604
+ "trial_name": null,
605
+ "trial_params": null
606
+ }
last-checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e652f371cbe27456b95b1d9e9643f9465e54ffa6734a66c5db4740dea53b2296
3
+ size 5368