Nessii013 commited on
Commit
7334af8
·
1 Parent(s): 4584af8

Upload trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +1021 -0
trainer_state.json ADDED
@@ -0,0 +1,1021 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 50,
6
+ "global_step": 485,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.008247422680412371,
13
+ "grad_norm": 16.375,
14
+ "learning_rate": 6.666666666666667e-06,
15
+ "loss": 0.8858,
16
+ "num_input_tokens_seen": 1413808,
17
+ "step": 4
18
+ },
19
+ {
20
+ "epoch": 0.016494845360824743,
21
+ "grad_norm": 4.1875,
22
+ "learning_rate": 1.3333333333333333e-05,
23
+ "loss": 0.4088,
24
+ "num_input_tokens_seen": 2866496,
25
+ "step": 8
26
+ },
27
+ {
28
+ "epoch": 0.024742268041237112,
29
+ "grad_norm": 3.03125,
30
+ "learning_rate": 2e-05,
31
+ "loss": 0.2731,
32
+ "num_input_tokens_seen": 4305104,
33
+ "step": 12
34
+ },
35
+ {
36
+ "epoch": 0.032989690721649485,
37
+ "grad_norm": 1.6484375,
38
+ "learning_rate": 2.6666666666666667e-05,
39
+ "loss": 0.2563,
40
+ "num_input_tokens_seen": 5594128,
41
+ "step": 16
42
+ },
43
+ {
44
+ "epoch": 0.041237113402061855,
45
+ "grad_norm": 1.46875,
46
+ "learning_rate": 3.3333333333333335e-05,
47
+ "loss": 0.2536,
48
+ "num_input_tokens_seen": 6683376,
49
+ "step": 20
50
+ },
51
+ {
52
+ "epoch": 0.049484536082474224,
53
+ "grad_norm": 1.2421875,
54
+ "learning_rate": 4e-05,
55
+ "loss": 0.229,
56
+ "num_input_tokens_seen": 8030336,
57
+ "step": 24
58
+ },
59
+ {
60
+ "epoch": 0.0577319587628866,
61
+ "grad_norm": 1.2578125,
62
+ "learning_rate": 3.9992569962849926e-05,
63
+ "loss": 0.2212,
64
+ "num_input_tokens_seen": 9395728,
65
+ "step": 28
66
+ },
67
+ {
68
+ "epoch": 0.06597938144329897,
69
+ "grad_norm": 1.3671875,
70
+ "learning_rate": 3.99702853719449e-05,
71
+ "loss": 0.2275,
72
+ "num_input_tokens_seen": 10689344,
73
+ "step": 32
74
+ },
75
+ {
76
+ "epoch": 0.07422680412371134,
77
+ "grad_norm": 1.328125,
78
+ "learning_rate": 3.9933162784818745e-05,
79
+ "loss": 0.2262,
80
+ "num_input_tokens_seen": 11936704,
81
+ "step": 36
82
+ },
83
+ {
84
+ "epoch": 0.08247422680412371,
85
+ "grad_norm": 1.484375,
86
+ "learning_rate": 3.988122978369162e-05,
87
+ "loss": 0.2254,
88
+ "num_input_tokens_seen": 13217248,
89
+ "step": 40
90
+ },
91
+ {
92
+ "epoch": 0.09072164948453608,
93
+ "grad_norm": 1.15625,
94
+ "learning_rate": 3.981452495497628e-05,
95
+ "loss": 0.2186,
96
+ "num_input_tokens_seen": 14587328,
97
+ "step": 44
98
+ },
99
+ {
100
+ "epoch": 0.09896907216494845,
101
+ "grad_norm": 1.109375,
102
+ "learning_rate": 3.973309786060829e-05,
103
+ "loss": 0.1971,
104
+ "num_input_tokens_seen": 15976464,
105
+ "step": 48
106
+ },
107
+ {
108
+ "epoch": 0.10721649484536082,
109
+ "grad_norm": 1.125,
110
+ "learning_rate": 3.963700900122124e-05,
111
+ "loss": 0.2231,
112
+ "num_input_tokens_seen": 17262576,
113
+ "step": 52
114
+ },
115
+ {
116
+ "epoch": 0.1154639175257732,
117
+ "grad_norm": 0.9765625,
118
+ "learning_rate": 3.952632977119465e-05,
119
+ "loss": 0.2029,
120
+ "num_input_tokens_seen": 18801264,
121
+ "step": 56
122
+ },
123
+ {
124
+ "epoch": 0.12371134020618557,
125
+ "grad_norm": 1.0625,
126
+ "learning_rate": 3.9401142405607594e-05,
127
+ "loss": 0.2033,
128
+ "num_input_tokens_seen": 20158000,
129
+ "step": 60
130
+ },
131
+ {
132
+ "epoch": 0.13195876288659794,
133
+ "grad_norm": 1.09375,
134
+ "learning_rate": 3.9261539919137776e-05,
135
+ "loss": 0.2278,
136
+ "num_input_tokens_seen": 21322240,
137
+ "step": 64
138
+ },
139
+ {
140
+ "epoch": 0.1402061855670103,
141
+ "grad_norm": 1.0,
142
+ "learning_rate": 3.9107626036951266e-05,
143
+ "loss": 0.1998,
144
+ "num_input_tokens_seen": 22631360,
145
+ "step": 68
146
+ },
147
+ {
148
+ "epoch": 0.14845360824742268,
149
+ "grad_norm": 1.109375,
150
+ "learning_rate": 3.8939515117634326e-05,
151
+ "loss": 0.2148,
152
+ "num_input_tokens_seen": 23848496,
153
+ "step": 72
154
+ },
155
+ {
156
+ "epoch": 0.15670103092783505,
157
+ "grad_norm": 1.1484375,
158
+ "learning_rate": 3.875733206822452e-05,
159
+ "loss": 0.2246,
160
+ "num_input_tokens_seen": 25148336,
161
+ "step": 76
162
+ },
163
+ {
164
+ "epoch": 0.16494845360824742,
165
+ "grad_norm": 1.203125,
166
+ "learning_rate": 3.8561212251404406e-05,
167
+ "loss": 0.2056,
168
+ "num_input_tokens_seen": 26427264,
169
+ "step": 80
170
+ },
171
+ {
172
+ "epoch": 0.1731958762886598,
173
+ "grad_norm": 1.1328125,
174
+ "learning_rate": 3.835130138492644e-05,
175
+ "loss": 0.203,
176
+ "num_input_tokens_seen": 27833024,
177
+ "step": 84
178
+ },
179
+ {
180
+ "epoch": 0.18144329896907216,
181
+ "grad_norm": 1.1015625,
182
+ "learning_rate": 3.812775543334425e-05,
183
+ "loss": 0.1912,
184
+ "num_input_tokens_seen": 29273008,
185
+ "step": 88
186
+ },
187
+ {
188
+ "epoch": 0.18969072164948453,
189
+ "grad_norm": 1.2265625,
190
+ "learning_rate": 3.789074049213033e-05,
191
+ "loss": 0.2182,
192
+ "num_input_tokens_seen": 30624112,
193
+ "step": 92
194
+ },
195
+ {
196
+ "epoch": 0.1979381443298969,
197
+ "grad_norm": 1.1796875,
198
+ "learning_rate": 3.7640432664266514e-05,
199
+ "loss": 0.216,
200
+ "num_input_tokens_seen": 31857552,
201
+ "step": 96
202
+ },
203
+ {
204
+ "epoch": 0.20618556701030927,
205
+ "grad_norm": 1.125,
206
+ "learning_rate": 3.737701792939881e-05,
207
+ "loss": 0.2065,
208
+ "num_input_tokens_seen": 33116768,
209
+ "step": 100
210
+ },
211
+ {
212
+ "epoch": 0.21443298969072164,
213
+ "grad_norm": 1.0,
214
+ "learning_rate": 3.7100692005653796e-05,
215
+ "loss": 0.206,
216
+ "num_input_tokens_seen": 34461024,
217
+ "step": 104
218
+ },
219
+ {
220
+ "epoch": 0.22268041237113403,
221
+ "grad_norm": 1.0703125,
222
+ "learning_rate": 3.681166020421938e-05,
223
+ "loss": 0.1912,
224
+ "num_input_tokens_seen": 35915264,
225
+ "step": 108
226
+ },
227
+ {
228
+ "epoch": 0.2309278350515464,
229
+ "grad_norm": 1.0625,
230
+ "learning_rate": 3.6510137276797786e-05,
231
+ "loss": 0.1952,
232
+ "num_input_tokens_seen": 37264080,
233
+ "step": 112
234
+ },
235
+ {
236
+ "epoch": 0.23917525773195877,
237
+ "grad_norm": 1.109375,
238
+ "learning_rate": 3.6196347256044236e-05,
239
+ "loss": 0.2273,
240
+ "num_input_tokens_seen": 38539072,
241
+ "step": 116
242
+ },
243
+ {
244
+ "epoch": 0.24742268041237114,
245
+ "grad_norm": 1.109375,
246
+ "learning_rate": 3.5870523289109886e-05,
247
+ "loss": 0.2041,
248
+ "num_input_tokens_seen": 39930480,
249
+ "step": 120
250
+ },
251
+ {
252
+ "epoch": 0.2556701030927835,
253
+ "grad_norm": 1.15625,
254
+ "learning_rate": 3.553290746441261e-05,
255
+ "loss": 0.2065,
256
+ "num_input_tokens_seen": 41066544,
257
+ "step": 124
258
+ },
259
+ {
260
+ "epoch": 0.2639175257731959,
261
+ "grad_norm": 1.0390625,
262
+ "learning_rate": 3.5183750631764406e-05,
263
+ "loss": 0.1979,
264
+ "num_input_tokens_seen": 42372160,
265
+ "step": 128
266
+ },
267
+ {
268
+ "epoch": 0.2721649484536082,
269
+ "grad_norm": 1.0703125,
270
+ "learning_rate": 3.4823312215989046e-05,
271
+ "loss": 0.2079,
272
+ "num_input_tokens_seen": 43644832,
273
+ "step": 132
274
+ },
275
+ {
276
+ "epoch": 0.2804123711340206,
277
+ "grad_norm": 1.0078125,
278
+ "learning_rate": 3.445186002416849e-05,
279
+ "loss": 0.2058,
280
+ "num_input_tokens_seen": 44948816,
281
+ "step": 136
282
+ },
283
+ {
284
+ "epoch": 0.28865979381443296,
285
+ "grad_norm": 1.09375,
286
+ "learning_rate": 3.4069670046661197e-05,
287
+ "loss": 0.1857,
288
+ "num_input_tokens_seen": 46404048,
289
+ "step": 140
290
+ },
291
+ {
292
+ "epoch": 0.29690721649484536,
293
+ "grad_norm": 1.1015625,
294
+ "learning_rate": 3.3677026252040306e-05,
295
+ "loss": 0.212,
296
+ "num_input_tokens_seen": 47646208,
297
+ "step": 144
298
+ },
299
+ {
300
+ "epoch": 0.30515463917525776,
301
+ "grad_norm": 1.0234375,
302
+ "learning_rate": 3.327422037610389e-05,
303
+ "loss": 0.1983,
304
+ "num_input_tokens_seen": 49010928,
305
+ "step": 148
306
+ },
307
+ {
308
+ "epoch": 0.3134020618556701,
309
+ "grad_norm": 0.9375,
310
+ "learning_rate": 3.286155170511419e-05,
311
+ "loss": 0.197,
312
+ "num_input_tokens_seen": 50440128,
313
+ "step": 152
314
+ },
315
+ {
316
+ "epoch": 0.3216494845360825,
317
+ "grad_norm": 1.140625,
318
+ "learning_rate": 3.2439326853426824e-05,
319
+ "loss": 0.2028,
320
+ "num_input_tokens_seen": 51797840,
321
+ "step": 156
322
+ },
323
+ {
324
+ "epoch": 0.32989690721649484,
325
+ "grad_norm": 0.984375,
326
+ "learning_rate": 3.200785953567517e-05,
327
+ "loss": 0.196,
328
+ "num_input_tokens_seen": 53109456,
329
+ "step": 160
330
+ },
331
+ {
332
+ "epoch": 0.33814432989690724,
333
+ "grad_norm": 1.0390625,
334
+ "learning_rate": 3.156747033367922e-05,
335
+ "loss": 0.2016,
336
+ "num_input_tokens_seen": 54440768,
337
+ "step": 164
338
+ },
339
+ {
340
+ "epoch": 0.3463917525773196,
341
+ "grad_norm": 0.97265625,
342
+ "learning_rate": 3.1118486458252094e-05,
343
+ "loss": 0.1975,
344
+ "num_input_tokens_seen": 55879424,
345
+ "step": 168
346
+ },
347
+ {
348
+ "epoch": 0.354639175257732,
349
+ "grad_norm": 1.0234375,
350
+ "learning_rate": 3.0661241506081236e-05,
351
+ "loss": 0.1965,
352
+ "num_input_tokens_seen": 57154384,
353
+ "step": 172
354
+ },
355
+ {
356
+ "epoch": 0.3628865979381443,
357
+ "grad_norm": 0.95703125,
358
+ "learning_rate": 3.019607521186475e-05,
359
+ "loss": 0.2078,
360
+ "num_input_tokens_seen": 58470672,
361
+ "step": 176
362
+ },
363
+ {
364
+ "epoch": 0.3711340206185567,
365
+ "grad_norm": 1.0625,
366
+ "learning_rate": 2.972333319588736e-05,
367
+ "loss": 0.2092,
368
+ "num_input_tokens_seen": 59684416,
369
+ "step": 180
370
+ },
371
+ {
372
+ "epoch": 0.37938144329896906,
373
+ "grad_norm": 0.96484375,
374
+ "learning_rate": 2.9243366707223165e-05,
375
+ "loss": 0.2018,
376
+ "num_input_tokens_seen": 61002832,
377
+ "step": 184
378
+ },
379
+ {
380
+ "epoch": 0.38762886597938145,
381
+ "grad_norm": 1.0546875,
382
+ "learning_rate": 2.875653236275632e-05,
383
+ "loss": 0.2072,
384
+ "num_input_tokens_seen": 62262064,
385
+ "step": 188
386
+ },
387
+ {
388
+ "epoch": 0.3958762886597938,
389
+ "grad_norm": 0.92578125,
390
+ "learning_rate": 2.8263191882213362e-05,
391
+ "loss": 0.1936,
392
+ "num_input_tokens_seen": 63678896,
393
+ "step": 192
394
+ },
395
+ {
396
+ "epoch": 0.4041237113402062,
397
+ "grad_norm": 0.9609375,
398
+ "learning_rate": 2.7763711819404098e-05,
399
+ "loss": 0.2069,
400
+ "num_input_tokens_seen": 64844672,
401
+ "step": 196
402
+ },
403
+ {
404
+ "epoch": 0.41237113402061853,
405
+ "grad_norm": 1.0546875,
406
+ "learning_rate": 2.7258463289870764e-05,
407
+ "loss": 0.1924,
408
+ "num_input_tokens_seen": 66274544,
409
+ "step": 200
410
+ },
411
+ {
412
+ "epoch": 0.42061855670103093,
413
+ "grad_norm": 0.88671875,
414
+ "learning_rate": 2.6747821695147806e-05,
415
+ "loss": 0.1949,
416
+ "num_input_tokens_seen": 67683072,
417
+ "step": 204
418
+ },
419
+ {
420
+ "epoch": 0.4288659793814433,
421
+ "grad_norm": 1.125,
422
+ "learning_rate": 2.623216644383715e-05,
423
+ "loss": 0.2092,
424
+ "num_input_tokens_seen": 68860288,
425
+ "step": 208
426
+ },
427
+ {
428
+ "epoch": 0.43711340206185567,
429
+ "grad_norm": 1.078125,
430
+ "learning_rate": 2.5711880669706172e-05,
431
+ "loss": 0.1959,
432
+ "num_input_tokens_seen": 70182736,
433
+ "step": 212
434
+ },
435
+ {
436
+ "epoch": 0.44536082474226807,
437
+ "grad_norm": 0.9296875,
438
+ "learning_rate": 2.5187350947017918e-05,
439
+ "loss": 0.2101,
440
+ "num_input_tokens_seen": 71494624,
441
+ "step": 216
442
+ },
443
+ {
444
+ "epoch": 0.4536082474226804,
445
+ "grad_norm": 0.86328125,
446
+ "learning_rate": 2.4658967003304986e-05,
447
+ "loss": 0.1925,
448
+ "num_input_tokens_seen": 72877248,
449
+ "step": 220
450
+ },
451
+ {
452
+ "epoch": 0.4618556701030928,
453
+ "grad_norm": 1.0078125,
454
+ "learning_rate": 2.4127121429800498e-05,
455
+ "loss": 0.1841,
456
+ "num_input_tokens_seen": 74118560,
457
+ "step": 224
458
+ },
459
+ {
460
+ "epoch": 0.47010309278350515,
461
+ "grad_norm": 0.88671875,
462
+ "learning_rate": 2.3592209389741372e-05,
463
+ "loss": 0.174,
464
+ "num_input_tokens_seen": 75598912,
465
+ "step": 228
466
+ },
467
+ {
468
+ "epoch": 0.47835051546391755,
469
+ "grad_norm": 1.0234375,
470
+ "learning_rate": 2.30546283247606e-05,
471
+ "loss": 0.207,
472
+ "num_input_tokens_seen": 76742752,
473
+ "step": 232
474
+ },
475
+ {
476
+ "epoch": 0.4865979381443299,
477
+ "grad_norm": 1.015625,
478
+ "learning_rate": 2.251477765958655e-05,
479
+ "loss": 0.1932,
480
+ "num_input_tokens_seen": 78206256,
481
+ "step": 236
482
+ },
483
+ {
484
+ "epoch": 0.4948453608247423,
485
+ "grad_norm": 1.0546875,
486
+ "learning_rate": 2.1973058505269007e-05,
487
+ "loss": 0.1946,
488
+ "num_input_tokens_seen": 79491408,
489
+ "step": 240
490
+ },
491
+ {
492
+ "epoch": 0.5030927835051546,
493
+ "grad_norm": 1.0546875,
494
+ "learning_rate": 2.1429873361152124e-05,
495
+ "loss": 0.1975,
496
+ "num_input_tokens_seen": 80718320,
497
+ "step": 244
498
+ },
499
+ {
500
+ "epoch": 0.511340206185567,
501
+ "grad_norm": 0.91015625,
502
+ "learning_rate": 2.088562581581592e-05,
503
+ "loss": 0.1964,
504
+ "num_input_tokens_seen": 81915456,
505
+ "step": 248
506
+ },
507
+ {
508
+ "epoch": 0.5195876288659794,
509
+ "grad_norm": 1.1015625,
510
+ "learning_rate": 2.0340720247208447e-05,
511
+ "loss": 0.191,
512
+ "num_input_tokens_seen": 83180624,
513
+ "step": 252
514
+ },
515
+ {
516
+ "epoch": 0.5278350515463918,
517
+ "grad_norm": 0.90234375,
518
+ "learning_rate": 1.9795561522191523e-05,
519
+ "loss": 0.1832,
520
+ "num_input_tokens_seen": 84571536,
521
+ "step": 256
522
+ },
523
+ {
524
+ "epoch": 0.5360824742268041,
525
+ "grad_norm": 1.046875,
526
+ "learning_rate": 1.9250554695723107e-05,
527
+ "loss": 0.1964,
528
+ "num_input_tokens_seen": 85841328,
529
+ "step": 260
530
+ },
531
+ {
532
+ "epoch": 0.5443298969072164,
533
+ "grad_norm": 0.953125,
534
+ "learning_rate": 1.8706104709899964e-05,
535
+ "loss": 0.1875,
536
+ "num_input_tokens_seen": 87241616,
537
+ "step": 264
538
+ },
539
+ {
540
+ "epoch": 0.5525773195876289,
541
+ "grad_norm": 0.92578125,
542
+ "learning_rate": 1.816261609308419e-05,
543
+ "loss": 0.1809,
544
+ "num_input_tokens_seen": 88600352,
545
+ "step": 268
546
+ },
547
+ {
548
+ "epoch": 0.5608247422680412,
549
+ "grad_norm": 0.87109375,
550
+ "learning_rate": 1.7620492659337155e-05,
551
+ "loss": 0.1793,
552
+ "num_input_tokens_seen": 90051376,
553
+ "step": 272
554
+ },
555
+ {
556
+ "epoch": 0.5690721649484536,
557
+ "grad_norm": 1.0390625,
558
+ "learning_rate": 1.7080137208384122e-05,
559
+ "loss": 0.1865,
560
+ "num_input_tokens_seen": 91429472,
561
+ "step": 276
562
+ },
563
+ {
564
+ "epoch": 0.5773195876288659,
565
+ "grad_norm": 0.9140625,
566
+ "learning_rate": 1.6541951226332565e-05,
567
+ "loss": 0.1745,
568
+ "num_input_tokens_seen": 92791856,
569
+ "step": 280
570
+ },
571
+ {
572
+ "epoch": 0.5855670103092784,
573
+ "grad_norm": 0.875,
574
+ "learning_rate": 1.600633458736653e-05,
575
+ "loss": 0.1925,
576
+ "num_input_tokens_seen": 94068304,
577
+ "step": 284
578
+ },
579
+ {
580
+ "epoch": 0.5938144329896907,
581
+ "grad_norm": 0.98828125,
582
+ "learning_rate": 1.5473685256638572e-05,
583
+ "loss": 0.1903,
584
+ "num_input_tokens_seen": 95338656,
585
+ "step": 288
586
+ },
587
+ {
588
+ "epoch": 0.6020618556701031,
589
+ "grad_norm": 0.90625,
590
+ "learning_rate": 1.4944398994580232e-05,
591
+ "loss": 0.1834,
592
+ "num_input_tokens_seen": 96565872,
593
+ "step": 292
594
+ },
595
+ {
596
+ "epoch": 0.6103092783505155,
597
+ "grad_norm": 1.0,
598
+ "learning_rate": 1.4418869062850514e-05,
599
+ "loss": 0.211,
600
+ "num_input_tokens_seen": 97845776,
601
+ "step": 296
602
+ },
603
+ {
604
+ "epoch": 0.6185567010309279,
605
+ "grad_norm": 0.92578125,
606
+ "learning_rate": 1.3897485932141042e-05,
607
+ "loss": 0.1872,
608
+ "num_input_tokens_seen": 99080048,
609
+ "step": 300
610
+ },
611
+ {
612
+ "epoch": 0.6268041237113402,
613
+ "grad_norm": 0.88671875,
614
+ "learning_rate": 1.3380636992054878e-05,
615
+ "loss": 0.17,
616
+ "num_input_tokens_seen": 100563184,
617
+ "step": 304
618
+ },
619
+ {
620
+ "epoch": 0.6350515463917525,
621
+ "grad_norm": 0.9375,
622
+ "learning_rate": 1.2868706263274602e-05,
623
+ "loss": 0.1935,
624
+ "num_input_tokens_seen": 101820432,
625
+ "step": 308
626
+ },
627
+ {
628
+ "epoch": 0.643298969072165,
629
+ "grad_norm": 0.88671875,
630
+ "learning_rate": 1.236207411223353e-05,
631
+ "loss": 0.1833,
632
+ "num_input_tokens_seen": 103280736,
633
+ "step": 312
634
+ },
635
+ {
636
+ "epoch": 0.6515463917525773,
637
+ "grad_norm": 0.88671875,
638
+ "learning_rate": 1.1861116968502015e-05,
639
+ "loss": 0.1815,
640
+ "num_input_tokens_seen": 104563920,
641
+ "step": 316
642
+ },
643
+ {
644
+ "epoch": 0.6597938144329897,
645
+ "grad_norm": 0.875,
646
+ "learning_rate": 1.136620704509892e-05,
647
+ "loss": 0.1816,
648
+ "num_input_tokens_seen": 105869408,
649
+ "step": 320
650
+ },
651
+ {
652
+ "epoch": 0.668041237113402,
653
+ "grad_norm": 0.9296875,
654
+ "learning_rate": 1.087771206193593e-05,
655
+ "loss": 0.1837,
656
+ "num_input_tokens_seen": 107213792,
657
+ "step": 324
658
+ },
659
+ {
660
+ "epoch": 0.6762886597938145,
661
+ "grad_norm": 0.96484375,
662
+ "learning_rate": 1.0395994972600285e-05,
663
+ "loss": 0.1775,
664
+ "num_input_tokens_seen": 108623536,
665
+ "step": 328
666
+ },
667
+ {
668
+ "epoch": 0.6845360824742268,
669
+ "grad_norm": 0.97265625,
670
+ "learning_rate": 9.921413694678959e-06,
671
+ "loss": 0.2035,
672
+ "num_input_tokens_seen": 109750560,
673
+ "step": 332
674
+ },
675
+ {
676
+ "epoch": 0.6927835051546392,
677
+ "grad_norm": 0.921875,
678
+ "learning_rate": 9.454320843824512e-06,
679
+ "loss": 0.1862,
680
+ "num_input_tokens_seen": 111023152,
681
+ "step": 336
682
+ },
683
+ {
684
+ "epoch": 0.7010309278350515,
685
+ "grad_norm": 0.95703125,
686
+ "learning_rate": 8.995063471760377e-06,
687
+ "loss": 0.1927,
688
+ "num_input_tokens_seen": 112284320,
689
+ "step": 340
690
+ },
691
+ {
692
+ "epoch": 0.709278350515464,
693
+ "grad_norm": 0.96484375,
694
+ "learning_rate": 8.543982808420156e-06,
695
+ "loss": 0.1856,
696
+ "num_input_tokens_seen": 113630688,
697
+ "step": 344
698
+ },
699
+ {
700
+ "epoch": 0.7175257731958763,
701
+ "grad_norm": 0.953125,
702
+ "learning_rate": 8.101414008412469e-06,
703
+ "loss": 0.1792,
704
+ "num_input_tokens_seen": 114946320,
705
+ "step": 348
706
+ },
707
+ {
708
+ "epoch": 0.7257731958762886,
709
+ "grad_norm": 0.94921875,
710
+ "learning_rate": 7.667685901999875e-06,
711
+ "loss": 0.1891,
712
+ "num_input_tokens_seen": 116220208,
713
+ "step": 352
714
+ },
715
+ {
716
+ "epoch": 0.734020618556701,
717
+ "grad_norm": 0.8671875,
718
+ "learning_rate": 7.24312075077674e-06,
719
+ "loss": 0.1891,
720
+ "num_input_tokens_seen": 117614672,
721
+ "step": 356
722
+ },
723
+ {
724
+ "epoch": 0.7422680412371134,
725
+ "grad_norm": 1.0859375,
726
+ "learning_rate": 6.828034008227678e-06,
727
+ "loss": 0.1714,
728
+ "num_input_tokens_seen": 118996816,
729
+ "step": 360
730
+ },
731
+ {
732
+ "epoch": 0.7505154639175258,
733
+ "grad_norm": 0.90234375,
734
+ "learning_rate": 6.422734085344464e-06,
735
+ "loss": 0.1871,
736
+ "num_input_tokens_seen": 120229232,
737
+ "step": 364
738
+ },
739
+ {
740
+ "epoch": 0.7587628865979381,
741
+ "grad_norm": 0.8203125,
742
+ "learning_rate": 6.027522121475482e-06,
743
+ "loss": 0.1795,
744
+ "num_input_tokens_seen": 121495936,
745
+ "step": 368
746
+ },
747
+ {
748
+ "epoch": 0.7670103092783506,
749
+ "grad_norm": 0.95703125,
750
+ "learning_rate": 5.642691760578116e-06,
751
+ "loss": 0.1833,
752
+ "num_input_tokens_seen": 122787872,
753
+ "step": 372
754
+ },
755
+ {
756
+ "epoch": 0.7752577319587629,
757
+ "grad_norm": 0.83984375,
758
+ "learning_rate": 5.268528933040147e-06,
759
+ "loss": 0.1673,
760
+ "num_input_tokens_seen": 124257600,
761
+ "step": 376
762
+ },
763
+ {
764
+ "epoch": 0.7835051546391752,
765
+ "grad_norm": 0.875,
766
+ "learning_rate": 4.905311643232464e-06,
767
+ "loss": 0.1763,
768
+ "num_input_tokens_seen": 125705408,
769
+ "step": 380
770
+ },
771
+ {
772
+ "epoch": 0.7917525773195876,
773
+ "grad_norm": 0.9765625,
774
+ "learning_rate": 4.553309762950739e-06,
775
+ "loss": 0.1877,
776
+ "num_input_tokens_seen": 126862272,
777
+ "step": 384
778
+ },
779
+ {
780
+ "epoch": 0.8,
781
+ "grad_norm": 0.90625,
782
+ "learning_rate": 4.212784830899725e-06,
783
+ "loss": 0.1795,
784
+ "num_input_tokens_seen": 128153600,
785
+ "step": 388
786
+ },
787
+ {
788
+ "epoch": 0.8082474226804124,
789
+ "grad_norm": 0.828125,
790
+ "learning_rate": 3.8839898583689725e-06,
791
+ "loss": 0.1803,
792
+ "num_input_tokens_seen": 129461872,
793
+ "step": 392
794
+ },
795
+ {
796
+ "epoch": 0.8164948453608247,
797
+ "grad_norm": 0.88671875,
798
+ "learning_rate": 3.567169141244562e-06,
799
+ "loss": 0.179,
800
+ "num_input_tokens_seen": 130662064,
801
+ "step": 396
802
+ },
803
+ {
804
+ "epoch": 0.8247422680412371,
805
+ "grad_norm": 0.8359375,
806
+ "learning_rate": 3.262558078496301e-06,
807
+ "loss": 0.1679,
808
+ "num_input_tokens_seen": 131997568,
809
+ "step": 400
810
+ },
811
+ {
812
+ "epoch": 0.8329896907216495,
813
+ "grad_norm": 0.94921875,
814
+ "learning_rate": 2.9703829972754407e-06,
815
+ "loss": 0.1974,
816
+ "num_input_tokens_seen": 133415744,
817
+ "step": 404
818
+ },
819
+ {
820
+ "epoch": 0.8412371134020619,
821
+ "grad_norm": 0.828125,
822
+ "learning_rate": 2.69086098475277e-06,
823
+ "loss": 0.1699,
824
+ "num_input_tokens_seen": 134815840,
825
+ "step": 408
826
+ },
827
+ {
828
+ "epoch": 0.8494845360824742,
829
+ "grad_norm": 0.9921875,
830
+ "learning_rate": 2.4241997268220096e-06,
831
+ "loss": 0.1815,
832
+ "num_input_tokens_seen": 136262128,
833
+ "step": 412
834
+ },
835
+ {
836
+ "epoch": 0.8577319587628865,
837
+ "grad_norm": 0.96484375,
838
+ "learning_rate": 2.1705973537884615e-06,
839
+ "loss": 0.1781,
840
+ "num_input_tokens_seen": 137430160,
841
+ "step": 416
842
+ },
843
+ {
844
+ "epoch": 0.865979381443299,
845
+ "grad_norm": 0.95703125,
846
+ "learning_rate": 1.9302422931574183e-06,
847
+ "loss": 0.1899,
848
+ "num_input_tokens_seen": 138709200,
849
+ "step": 420
850
+ },
851
+ {
852
+ "epoch": 0.8742268041237113,
853
+ "grad_norm": 0.83984375,
854
+ "learning_rate": 1.7033131296318473e-06,
855
+ "loss": 0.1795,
856
+ "num_input_tokens_seen": 140033680,
857
+ "step": 424
858
+ },
859
+ {
860
+ "epoch": 0.8824742268041237,
861
+ "grad_norm": 0.8203125,
862
+ "learning_rate": 1.4899784724232968e-06,
863
+ "loss": 0.1749,
864
+ "num_input_tokens_seen": 141348848,
865
+ "step": 428
866
+ },
867
+ {
868
+ "epoch": 0.8907216494845361,
869
+ "grad_norm": 0.83984375,
870
+ "learning_rate": 1.2903968299746094e-06,
871
+ "loss": 0.171,
872
+ "num_input_tokens_seen": 142797664,
873
+ "step": 432
874
+ },
875
+ {
876
+ "epoch": 0.8989690721649485,
877
+ "grad_norm": 0.89453125,
878
+ "learning_rate": 1.104716492187574e-06,
879
+ "loss": 0.1812,
880
+ "num_input_tokens_seen": 144154208,
881
+ "step": 436
882
+ },
883
+ {
884
+ "epoch": 0.9072164948453608,
885
+ "grad_norm": 1.03125,
886
+ "learning_rate": 9.330754202429726e-07,
887
+ "loss": 0.1891,
888
+ "num_input_tokens_seen": 145332560,
889
+ "step": 440
890
+ },
891
+ {
892
+ "epoch": 0.9154639175257732,
893
+ "grad_norm": 0.98046875,
894
+ "learning_rate": 7.756011440948996e-07,
895
+ "loss": 0.1902,
896
+ "num_input_tokens_seen": 146527344,
897
+ "step": 444
898
+ },
899
+ {
900
+ "epoch": 0.9237113402061856,
901
+ "grad_norm": 1.0,
902
+ "learning_rate": 6.324106677155573e-07,
903
+ "loss": 0.1793,
904
+ "num_input_tokens_seen": 147821568,
905
+ "step": 448
906
+ },
907
+ {
908
+ "epoch": 0.931958762886598,
909
+ "grad_norm": 0.88671875,
910
+ "learning_rate": 5.036103821608485e-07,
911
+ "loss": 0.1844,
912
+ "num_input_tokens_seen": 149191664,
913
+ "step": 452
914
+ },
915
+ {
916
+ "epoch": 0.9402061855670103,
917
+ "grad_norm": 0.8515625,
918
+ "learning_rate": 3.892959865214363e-07,
919
+ "loss": 0.1795,
920
+ "num_input_tokens_seen": 150526864,
921
+ "step": 456
922
+ },
923
+ {
924
+ "epoch": 0.9484536082474226,
925
+ "grad_norm": 0.9453125,
926
+ "learning_rate": 2.8955241681795534e-07,
927
+ "loss": 0.1859,
928
+ "num_input_tokens_seen": 151861952,
929
+ "step": 460
930
+ },
931
+ {
932
+ "epoch": 0.9567010309278351,
933
+ "grad_norm": 0.71875,
934
+ "learning_rate": 2.044537828932458e-07,
935
+ "loss": 0.1787,
936
+ "num_input_tokens_seen": 153201488,
937
+ "step": 464
938
+ },
939
+ {
940
+ "epoch": 0.9649484536082474,
941
+ "grad_norm": 0.87109375,
942
+ "learning_rate": 1.3406331334845813e-07,
943
+ "loss": 0.1884,
944
+ "num_input_tokens_seen": 154511184,
945
+ "step": 468
946
+ },
947
+ {
948
+ "epoch": 0.9731958762886598,
949
+ "grad_norm": 0.796875,
950
+ "learning_rate": 7.843330856396103e-08,
951
+ "loss": 0.1858,
952
+ "num_input_tokens_seen": 155737200,
953
+ "step": 472
954
+ },
955
+ {
956
+ "epoch": 0.9814432989690721,
957
+ "grad_norm": 0.8671875,
958
+ "learning_rate": 3.760510183997701e-08,
959
+ "loss": 0.183,
960
+ "num_input_tokens_seen": 157084960,
961
+ "step": 476
962
+ },
963
+ {
964
+ "epoch": 0.9896907216494846,
965
+ "grad_norm": 0.93359375,
966
+ "learning_rate": 1.160902868577951e-08,
967
+ "loss": 0.1908,
968
+ "num_input_tokens_seen": 158349904,
969
+ "step": 480
970
+ },
971
+ {
972
+ "epoch": 0.9979381443298969,
973
+ "grad_norm": 0.8984375,
974
+ "learning_rate": 4.64404280295927e-10,
975
+ "loss": 0.1764,
976
+ "num_input_tokens_seen": 159669152,
977
+ "step": 484
978
+ },
979
+ {
980
+ "epoch": 1.0,
981
+ "eval_loss": 0.0952233299612999,
982
+ "eval_runtime": 83.4314,
983
+ "eval_samples_per_second": 12.453,
984
+ "eval_steps_per_second": 0.396,
985
+ "num_input_tokens_seen": 160041824,
986
+ "step": 485
987
+ },
988
+ {
989
+ "epoch": 1.0,
990
+ "num_input_tokens_seen": 160041824,
991
+ "step": 485,
992
+ "total_flos": 9.01334912177537e+17,
993
+ "train_loss": 0.20323730206366666,
994
+ "train_runtime": 14397.8817,
995
+ "train_samples_per_second": 4.306,
996
+ "train_steps_per_second": 0.034,
997
+ "train_tokens_per_second": 1386.226
998
+ }
999
+ ],
1000
+ "logging_steps": 4,
1001
+ "max_steps": 485,
1002
+ "num_input_tokens_seen": 160041824,
1003
+ "num_train_epochs": 1,
1004
+ "save_steps": 0,
1005
+ "stateful_callbacks": {
1006
+ "TrainerControl": {
1007
+ "args": {
1008
+ "should_epoch_stop": false,
1009
+ "should_evaluate": false,
1010
+ "should_log": false,
1011
+ "should_save": false,
1012
+ "should_training_stop": false
1013
+ },
1014
+ "attributes": {}
1015
+ }
1016
+ },
1017
+ "total_flos": 9.01334912177537e+17,
1018
+ "train_batch_size": 16,
1019
+ "trial_name": null,
1020
+ "trial_params": null
1021
+ }