jefson08 committed (verified)
Commit adf6665 · 1 Parent(s): 9b5ca51

Upload 10 files

Files changed (6)
  1. .gitattributes +1 -0
  2. model.safetensors +1 -1
  3. optimizer.pt +3 -0
  4. rng_state.pth +0 -0
  5. scheduler.pt +0 -0
  6. trainer_state.json +1497 -0
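
Besides the updated weights, this commit uploads the full Trainer resume state: optimizer.pt, scheduler.pt, rng_state.pth and trainer_state.json. A minimal sketch for peeking at those files once local copies exist (the local paths are assumptions, not part of the commit; optimizer.pt is ~4.5 GB, so loading it needs that much free RAM):

    # Sketch: inspect the resume-state files uploaded in this commit.
    # Assumes local copies of the files in the current directory.
    import json
    import torch

    # trainer_state.json is plain JSON with the training/eval log.
    with open("trainer_state.json") as f:
        state = json.load(f)
    print(state["best_model_checkpoint"], state["best_metric"])

    # optimizer.pt and scheduler.pt are torch-serialized state dicts
    # written by transformers.Trainer; map_location avoids needing a GPU.
    # (Some torch versions may require weights_only=False here.)
    opt_state = torch.load("optimizer.pt", map_location="cpu")
    sched_state = torch.load("scheduler.pt", map_location="cpu")
    print(list(opt_state.keys()))   # typically ['state', 'param_groups']
    print(sched_state)              # scheduler state, e.g. last_epoch / _step_count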
.gitattributes CHANGED
@@ -1,3 +1,4 @@
  model.safetensors filter=lfs diff=lfs merge=lfs -text
  model.SRC filter=lfs diff=lfs merge=lfs -text
  model.TGT filter=lfs diff=lfs merge=lfs -text
+ optimizer.pt filter=lfs diff=lfs merge=lfs -text
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ea67a27d22d8aed4603f79437cd9a36cdf096acf2c62df9dd9d20d7343546e27
+ oid sha256:b807377b9259c63916b600d8d5eb99c70dc4a2085059628e505000b277a9f84b
  size 2247492800
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8171315046191596a4cd9a881659f2157b1d1a3b94517b2186ee617eba4b2515
+ size 4495445235
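
The large binaries above (model.safetensors, optimizer.pt) are tracked with Git LFS, so the diff only shows pointer files: the LFS spec version, the sha256 oid, and the byte size. A minimal sketch, assuming `git lfs pull` (or an equivalent download) has already fetched the real optimizer.pt, for checking the file against the pointer shown above:

    # Sketch: verify a downloaded LFS object against its pointer (oid + size).
    # Expected values are copied from the optimizer.pt pointer in this diff.
    import hashlib
    import os

    def sha256_of(path, chunk=1 << 20):
        h = hashlib.sha256()
        with open(path, "rb") as f:
            while block := f.read(chunk):
                h.update(block)
        return h.hexdigest()

    expected_oid = "8171315046191596a4cd9a881659f2157b1d1a3b94517b2186ee617eba4b2515"
    expected_size = 4495445235  # bytes

    path = "optimizer.pt"  # assumed local path to the fetched file
    assert os.path.getsize(path) == expected_size
    assert sha256_of(path) == expected_oid
    print("optimizer.pt matches its LFS pointer")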
rng_state.pth ADDED
Binary file (14.3 kB)
 
scheduler.pt ADDED
Binary file (1.06 kB)
 
trainer_state.json ADDED
@@ -0,0 +1,1497 @@
1
+ {
2
+ "best_metric": 34.19371467731715,
3
+ "best_model_checkpoint": "indictrans-en-ne-checkpoint-1B-2/checkpoint-12000",
4
+ "epoch": 1.9999086966446016,
5
+ "eval_steps": 1200,
6
+ "global_step": 16428,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.012173780719774785,
13
+ "grad_norm": 0.1044921875,
14
+ "learning_rate": 0.0001414213562373095,
15
+ "loss": 0.1798,
16
+ "num_input_tokens_seen": 3276800,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.02434756143954957,
21
+ "grad_norm": 0.0908203125,
22
+ "learning_rate": 0.0001,
23
+ "loss": 0.1787,
24
+ "num_input_tokens_seen": 6553600,
25
+ "step": 200
26
+ },
27
+ {
28
+ "epoch": 0.03652134215932436,
29
+ "grad_norm": 0.08349609375,
30
+ "learning_rate": 8.164965809277262e-05,
31
+ "loss": 0.1784,
32
+ "num_input_tokens_seen": 9830400,
33
+ "step": 300
34
+ },
35
+ {
36
+ "epoch": 0.04869512287909914,
37
+ "grad_norm": 0.08203125,
38
+ "learning_rate": 7.071067811865475e-05,
39
+ "loss": 0.1775,
40
+ "num_input_tokens_seen": 13107200,
41
+ "step": 400
42
+ },
43
+ {
44
+ "epoch": 0.060868903598873925,
45
+ "grad_norm": 0.08740234375,
46
+ "learning_rate": 6.324555320336759e-05,
47
+ "loss": 0.177,
48
+ "num_input_tokens_seen": 16384000,
49
+ "step": 500
50
+ },
51
+ {
52
+ "epoch": 0.07304268431864872,
53
+ "grad_norm": 0.09228515625,
54
+ "learning_rate": 5.7735026918962585e-05,
55
+ "loss": 0.1757,
56
+ "num_input_tokens_seen": 19660800,
57
+ "step": 600
58
+ },
59
+ {
60
+ "epoch": 0.08521646503842349,
61
+ "grad_norm": 0.091796875,
62
+ "learning_rate": 5.3452248382484884e-05,
63
+ "loss": 0.1749,
64
+ "num_input_tokens_seen": 22937600,
65
+ "step": 700
66
+ },
67
+ {
68
+ "epoch": 0.09739024575819828,
69
+ "grad_norm": 0.08154296875,
70
+ "learning_rate": 5e-05,
71
+ "loss": 0.1745,
72
+ "num_input_tokens_seen": 26214400,
73
+ "step": 800
74
+ },
75
+ {
76
+ "epoch": 0.10956402647797306,
77
+ "grad_norm": 0.0908203125,
78
+ "learning_rate": 4.7140452079103176e-05,
79
+ "loss": 0.1728,
80
+ "num_input_tokens_seen": 29491200,
81
+ "step": 900
82
+ },
83
+ {
84
+ "epoch": 0.12173780719774785,
85
+ "grad_norm": 0.0791015625,
86
+ "learning_rate": 4.4721359549995795e-05,
87
+ "loss": 0.175,
88
+ "num_input_tokens_seen": 32768000,
89
+ "step": 1000
90
+ },
91
+ {
92
+ "epoch": 0.13391158791752264,
93
+ "grad_norm": 0.09765625,
94
+ "learning_rate": 4.264014327112208e-05,
95
+ "loss": 0.1757,
96
+ "num_input_tokens_seen": 36044800,
97
+ "step": 1100
98
+ },
99
+ {
100
+ "epoch": 0.14608536863729743,
101
+ "grad_norm": 0.0830078125,
102
+ "learning_rate": 4.082482904638631e-05,
103
+ "loss": 0.1718,
104
+ "num_input_tokens_seen": 39321600,
105
+ "step": 1200
106
+ },
107
+ {
108
+ "epoch": 0.14608536863729743,
109
+ "eval_BLEU": 33.851984753329205,
110
+ "eval_chrF": 59.48761000963931,
111
+ "eval_loss": 0.1480654925107956,
112
+ "eval_runtime": 4021.48,
113
+ "eval_samples_per_second": 4.692,
114
+ "eval_steps_per_second": 0.293,
115
+ "num_input_tokens_seen": 39321600,
116
+ "step": 1200
117
+ },
118
+ {
119
+ "epoch": 0.1582591493570722,
120
+ "grad_norm": 0.08544921875,
121
+ "learning_rate": 3.922322702763681e-05,
122
+ "loss": 0.1719,
123
+ "num_input_tokens_seen": 42598400,
124
+ "step": 1300
125
+ },
126
+ {
127
+ "epoch": 0.17043293007684698,
128
+ "grad_norm": 0.0908203125,
129
+ "learning_rate": 3.779644730092272e-05,
130
+ "loss": 0.1718,
131
+ "num_input_tokens_seen": 45875200,
132
+ "step": 1400
133
+ },
134
+ {
135
+ "epoch": 0.18260671079662177,
136
+ "grad_norm": 0.08837890625,
137
+ "learning_rate": 3.651483716701107e-05,
138
+ "loss": 0.1685,
139
+ "num_input_tokens_seen": 49152000,
140
+ "step": 1500
141
+ },
142
+ {
143
+ "epoch": 0.19478049151639656,
144
+ "grad_norm": 0.09130859375,
145
+ "learning_rate": 3.535533905932738e-05,
146
+ "loss": 0.1742,
147
+ "num_input_tokens_seen": 52428800,
148
+ "step": 1600
149
+ },
150
+ {
151
+ "epoch": 0.20695427223617135,
152
+ "grad_norm": 0.09033203125,
153
+ "learning_rate": 3.4299717028501764e-05,
154
+ "loss": 0.1736,
155
+ "num_input_tokens_seen": 55705600,
156
+ "step": 1700
157
+ },
158
+ {
159
+ "epoch": 0.21912805295594612,
160
+ "grad_norm": 0.10498046875,
161
+ "learning_rate": 3.3333333333333335e-05,
162
+ "loss": 0.1714,
163
+ "num_input_tokens_seen": 58982400,
164
+ "step": 1800
165
+ },
166
+ {
167
+ "epoch": 0.2313018336757209,
168
+ "grad_norm": 0.08447265625,
169
+ "learning_rate": 3.244428422615251e-05,
170
+ "loss": 0.1718,
171
+ "num_input_tokens_seen": 62259200,
172
+ "step": 1900
173
+ },
174
+ {
175
+ "epoch": 0.2434756143954957,
176
+ "grad_norm": 0.08544921875,
177
+ "learning_rate": 3.1622776601683795e-05,
178
+ "loss": 0.1709,
179
+ "num_input_tokens_seen": 65536000,
180
+ "step": 2000
181
+ },
182
+ {
183
+ "epoch": 0.25564939511527046,
184
+ "grad_norm": 0.0908203125,
185
+ "learning_rate": 3.086066999241838e-05,
186
+ "loss": 0.175,
187
+ "num_input_tokens_seen": 68812800,
188
+ "step": 2100
189
+ },
190
+ {
191
+ "epoch": 0.2678231758350453,
192
+ "grad_norm": 0.09521484375,
193
+ "learning_rate": 3.0151134457776364e-05,
194
+ "loss": 0.1706,
195
+ "num_input_tokens_seen": 72089600,
196
+ "step": 2200
197
+ },
198
+ {
199
+ "epoch": 0.27999695655482004,
200
+ "grad_norm": 0.0966796875,
201
+ "learning_rate": 2.948839123097943e-05,
202
+ "loss": 0.1758,
203
+ "num_input_tokens_seen": 75366400,
204
+ "step": 2300
205
+ },
206
+ {
207
+ "epoch": 0.29217073727459486,
208
+ "grad_norm": 0.080078125,
209
+ "learning_rate": 2.8867513459481293e-05,
210
+ "loss": 0.1741,
211
+ "num_input_tokens_seen": 78643200,
212
+ "step": 2400
213
+ },
214
+ {
215
+ "epoch": 0.29217073727459486,
216
+ "eval_BLEU": 34.039957674706194,
217
+ "eval_chrF": 59.611408479987894,
218
+ "eval_loss": 0.14743424952030182,
219
+ "eval_runtime": 4005.0853,
220
+ "eval_samples_per_second": 4.712,
221
+ "eval_steps_per_second": 0.295,
222
+ "num_input_tokens_seen": 78643200,
223
+ "step": 2400
224
+ },
225
+ {
226
+ "epoch": 0.3043445179943696,
227
+ "grad_norm": 0.08544921875,
228
+ "learning_rate": 2.8284271247461902e-05,
229
+ "loss": 0.1684,
230
+ "num_input_tokens_seen": 81920000,
231
+ "step": 2500
232
+ },
233
+ {
234
+ "epoch": 0.3165182987141444,
235
+ "grad_norm": 0.08642578125,
236
+ "learning_rate": 2.7735009811261458e-05,
237
+ "loss": 0.1729,
238
+ "num_input_tokens_seen": 85196800,
239
+ "step": 2600
240
+ },
241
+ {
242
+ "epoch": 0.3286920794339192,
243
+ "grad_norm": 0.0888671875,
244
+ "learning_rate": 2.721655269759087e-05,
245
+ "loss": 0.1753,
246
+ "num_input_tokens_seen": 88473600,
247
+ "step": 2700
248
+ },
249
+ {
250
+ "epoch": 0.34086586015369397,
251
+ "grad_norm": 0.09423828125,
252
+ "learning_rate": 2.6726124191242442e-05,
253
+ "loss": 0.1702,
254
+ "num_input_tokens_seen": 91750400,
255
+ "step": 2800
256
+ },
257
+ {
258
+ "epoch": 0.3530396408734688,
259
+ "grad_norm": 0.10107421875,
260
+ "learning_rate": 2.626128657194451e-05,
261
+ "loss": 0.1743,
262
+ "num_input_tokens_seen": 95027200,
263
+ "step": 2900
264
+ },
265
+ {
266
+ "epoch": 0.36521342159324355,
267
+ "grad_norm": 0.0927734375,
268
+ "learning_rate": 2.581988897471611e-05,
269
+ "loss": 0.1696,
270
+ "num_input_tokens_seen": 98304000,
271
+ "step": 3000
272
+ },
273
+ {
274
+ "epoch": 0.3773872023130183,
275
+ "grad_norm": 0.0810546875,
276
+ "learning_rate": 2.5400025400038102e-05,
277
+ "loss": 0.1724,
278
+ "num_input_tokens_seen": 101580800,
279
+ "step": 3100
280
+ },
281
+ {
282
+ "epoch": 0.38956098303279313,
283
+ "grad_norm": 0.0859375,
284
+ "learning_rate": 2.5e-05,
285
+ "loss": 0.1746,
286
+ "num_input_tokens_seen": 104857600,
287
+ "step": 3200
288
+ },
289
+ {
290
+ "epoch": 0.4017347637525679,
291
+ "grad_norm": 0.095703125,
292
+ "learning_rate": 2.4618298195866546e-05,
293
+ "loss": 0.1709,
294
+ "num_input_tokens_seen": 108134400,
295
+ "step": 3300
296
+ },
297
+ {
298
+ "epoch": 0.4139085444723427,
299
+ "grad_norm": 0.087890625,
300
+ "learning_rate": 2.42535625036333e-05,
301
+ "loss": 0.1707,
302
+ "num_input_tokens_seen": 111411200,
303
+ "step": 3400
304
+ },
305
+ {
306
+ "epoch": 0.4260823251921175,
307
+ "grad_norm": 0.09228515625,
308
+ "learning_rate": 2.3904572186687872e-05,
309
+ "loss": 0.171,
310
+ "num_input_tokens_seen": 114688000,
311
+ "step": 3500
312
+ },
313
+ {
314
+ "epoch": 0.43825610591189224,
315
+ "grad_norm": 0.0869140625,
316
+ "learning_rate": 2.3570226039551588e-05,
317
+ "loss": 0.1728,
318
+ "num_input_tokens_seen": 117964800,
319
+ "step": 3600
320
+ },
321
+ {
322
+ "epoch": 0.43825610591189224,
323
+ "eval_BLEU": 34.1284026902063,
324
+ "eval_chrF": 59.6592142520761,
325
+ "eval_loss": 0.14721617102622986,
326
+ "eval_runtime": 3969.206,
327
+ "eval_samples_per_second": 4.754,
328
+ "eval_steps_per_second": 0.297,
329
+ "num_input_tokens_seen": 117964800,
330
+ "step": 3600
331
+ },
332
+ {
333
+ "epoch": 0.45042988663166705,
334
+ "grad_norm": 0.0966796875,
335
+ "learning_rate": 2.324952774876386e-05,
336
+ "loss": 0.1708,
337
+ "num_input_tokens_seen": 121241600,
338
+ "step": 3700
339
+ },
340
+ {
341
+ "epoch": 0.4626036673514418,
342
+ "grad_norm": 0.08984375,
343
+ "learning_rate": 2.2941573387056174e-05,
344
+ "loss": 0.1705,
345
+ "num_input_tokens_seen": 124518400,
346
+ "step": 3800
347
+ },
348
+ {
349
+ "epoch": 0.47477744807121663,
350
+ "grad_norm": 0.0966796875,
351
+ "learning_rate": 2.2645540682891912e-05,
352
+ "loss": 0.1697,
353
+ "num_input_tokens_seen": 127795200,
354
+ "step": 3900
355
+ },
356
+ {
357
+ "epoch": 0.4869512287909914,
358
+ "grad_norm": 0.091796875,
359
+ "learning_rate": 2.2360679774997898e-05,
360
+ "loss": 0.1722,
361
+ "num_input_tokens_seen": 131072000,
362
+ "step": 4000
363
+ },
364
+ {
365
+ "epoch": 0.49912500951076616,
366
+ "grad_norm": 0.07958984375,
367
+ "learning_rate": 2.2086305214969307e-05,
368
+ "loss": 0.1696,
369
+ "num_input_tokens_seen": 134348800,
370
+ "step": 4100
371
+ },
372
+ {
373
+ "epoch": 0.5112987902305409,
374
+ "grad_norm": 0.08984375,
375
+ "learning_rate": 2.182178902359924e-05,
376
+ "loss": 0.1696,
377
+ "num_input_tokens_seen": 137625600,
378
+ "step": 4200
379
+ },
380
+ {
381
+ "epoch": 0.5234725709503157,
382
+ "grad_norm": 0.0869140625,
383
+ "learning_rate": 2.1566554640687683e-05,
384
+ "loss": 0.1721,
385
+ "num_input_tokens_seen": 140902400,
386
+ "step": 4300
387
+ },
388
+ {
389
+ "epoch": 0.5356463516700906,
390
+ "grad_norm": 0.0791015625,
391
+ "learning_rate": 2.132007163556104e-05,
392
+ "loss": 0.1706,
393
+ "num_input_tokens_seen": 144179200,
394
+ "step": 4400
395
+ },
396
+ {
397
+ "epoch": 0.5478201323898654,
398
+ "grad_norm": 0.0849609375,
399
+ "learning_rate": 2.1081851067789197e-05,
400
+ "loss": 0.1726,
401
+ "num_input_tokens_seen": 147456000,
402
+ "step": 4500
403
+ },
404
+ {
405
+ "epoch": 0.5599939131096401,
406
+ "grad_norm": 0.08984375,
407
+ "learning_rate": 2.0851441405707478e-05,
408
+ "loss": 0.1688,
409
+ "num_input_tokens_seen": 150732800,
410
+ "step": 4600
411
+ },
412
+ {
413
+ "epoch": 0.5721676938294149,
414
+ "grad_norm": 0.08544921875,
415
+ "learning_rate": 2.062842492517587e-05,
416
+ "loss": 0.1733,
417
+ "num_input_tokens_seen": 154009600,
418
+ "step": 4700
419
+ },
420
+ {
421
+ "epoch": 0.5843414745491897,
422
+ "grad_norm": 0.12158203125,
423
+ "learning_rate": 2.0412414523193156e-05,
424
+ "loss": 0.1692,
425
+ "num_input_tokens_seen": 157286400,
426
+ "step": 4800
427
+ },
428
+ {
429
+ "epoch": 0.5843414745491897,
430
+ "eval_BLEU": 34.08945609042042,
431
+ "eval_chrF": 59.66085393977354,
432
+ "eval_loss": 0.14703597128391266,
433
+ "eval_runtime": 3968.308,
434
+ "eval_samples_per_second": 4.755,
435
+ "eval_steps_per_second": 0.297,
436
+ "num_input_tokens_seen": 157286400,
437
+ "step": 4800
438
+ },
439
+ {
440
+ "epoch": 0.5965152552689644,
441
+ "grad_norm": 0.087890625,
442
+ "learning_rate": 2.0203050891044213e-05,
443
+ "loss": 0.1717,
444
+ "num_input_tokens_seen": 160563200,
445
+ "step": 4900
446
+ },
447
+ {
448
+ "epoch": 0.6086890359887392,
449
+ "grad_norm": 0.0908203125,
450
+ "learning_rate": 2e-05,
451
+ "loss": 0.169,
452
+ "num_input_tokens_seen": 163840000,
453
+ "step": 5000
454
+ },
455
+ {
456
+ "epoch": 0.6208628167085141,
457
+ "grad_norm": 0.09619140625,
458
+ "learning_rate": 1.980295085953349e-05,
459
+ "loss": 0.1722,
460
+ "num_input_tokens_seen": 167116800,
461
+ "step": 5100
462
+ },
463
+ {
464
+ "epoch": 0.6330365974282888,
465
+ "grad_norm": 0.09326171875,
466
+ "learning_rate": 1.9611613513818405e-05,
467
+ "loss": 0.1715,
468
+ "num_input_tokens_seen": 170393600,
469
+ "step": 5200
470
+ },
471
+ {
472
+ "epoch": 0.6452103781480636,
473
+ "grad_norm": 0.08203125,
474
+ "learning_rate": 1.9425717247145284e-05,
475
+ "loss": 0.171,
476
+ "num_input_tokens_seen": 173670400,
477
+ "step": 5300
478
+ },
479
+ {
480
+ "epoch": 0.6573841588678384,
481
+ "grad_norm": 0.08740234375,
482
+ "learning_rate": 1.9245008972987527e-05,
483
+ "loss": 0.1702,
484
+ "num_input_tokens_seen": 176947200,
485
+ "step": 5400
486
+ },
487
+ {
488
+ "epoch": 0.6695579395876132,
489
+ "grad_norm": 0.07958984375,
490
+ "learning_rate": 1.906925178491185e-05,
491
+ "loss": 0.1719,
492
+ "num_input_tokens_seen": 180224000,
493
+ "step": 5500
494
+ },
495
+ {
496
+ "epoch": 0.6817317203073879,
497
+ "grad_norm": 0.08642578125,
498
+ "learning_rate": 1.889822365046136e-05,
499
+ "loss": 0.1727,
500
+ "num_input_tokens_seen": 183500800,
501
+ "step": 5600
502
+ },
503
+ {
504
+ "epoch": 0.6939055010271628,
505
+ "grad_norm": 0.0927734375,
506
+ "learning_rate": 1.873171623163388e-05,
507
+ "loss": 0.1708,
508
+ "num_input_tokens_seen": 186777600,
509
+ "step": 5700
510
+ },
511
+ {
512
+ "epoch": 0.7060792817469376,
513
+ "grad_norm": 0.0888671875,
514
+ "learning_rate": 1.8569533817705186e-05,
515
+ "loss": 0.1694,
516
+ "num_input_tokens_seen": 190054400,
517
+ "step": 5800
518
+ },
519
+ {
520
+ "epoch": 0.7182530624667123,
521
+ "grad_norm": 0.08349609375,
522
+ "learning_rate": 1.841149235796647e-05,
523
+ "loss": 0.1741,
524
+ "num_input_tokens_seen": 193331200,
525
+ "step": 5900
526
+ },
527
+ {
528
+ "epoch": 0.7304268431864871,
529
+ "grad_norm": 0.08984375,
530
+ "learning_rate": 1.8257418583505536e-05,
531
+ "loss": 0.1693,
532
+ "num_input_tokens_seen": 196608000,
533
+ "step": 6000
534
+ },
535
+ {
536
+ "epoch": 0.7304268431864871,
537
+ "eval_BLEU": 34.04924804951594,
538
+ "eval_chrF": 59.694436913924406,
539
+ "eval_loss": 0.14692962169647217,
540
+ "eval_runtime": 3965.0872,
541
+ "eval_samples_per_second": 4.759,
542
+ "eval_steps_per_second": 0.298,
543
+ "num_input_tokens_seen": 196608000,
544
+ "step": 6000
545
+ },
546
+ {
547
+ "epoch": 0.7426006239062619,
548
+ "grad_norm": 0.0830078125,
549
+ "learning_rate": 1.8107149208503708e-05,
550
+ "loss": 0.1709,
551
+ "num_input_tokens_seen": 199884800,
552
+ "step": 6100
553
+ },
554
+ {
555
+ "epoch": 0.7547744046260366,
556
+ "grad_norm": 0.08740234375,
557
+ "learning_rate": 1.7960530202677492e-05,
558
+ "loss": 0.1687,
559
+ "num_input_tokens_seen": 203161600,
560
+ "step": 6200
561
+ },
562
+ {
563
+ "epoch": 0.7669481853458114,
564
+ "grad_norm": 0.08544921875,
565
+ "learning_rate": 1.781741612749496e-05,
566
+ "loss": 0.1728,
567
+ "num_input_tokens_seen": 206438400,
568
+ "step": 6300
569
+ },
570
+ {
571
+ "epoch": 0.7791219660655863,
572
+ "grad_norm": 0.08447265625,
573
+ "learning_rate": 1.767766952966369e-05,
574
+ "loss": 0.171,
575
+ "num_input_tokens_seen": 209715200,
576
+ "step": 6400
577
+ },
578
+ {
579
+ "epoch": 0.7912957467853611,
580
+ "grad_norm": 0.08447265625,
581
+ "learning_rate": 1.7541160386140587e-05,
582
+ "loss": 0.1709,
583
+ "num_input_tokens_seen": 212992000,
584
+ "step": 6500
585
+ },
586
+ {
587
+ "epoch": 0.8034695275051358,
588
+ "grad_norm": 0.0849609375,
589
+ "learning_rate": 1.7407765595569787e-05,
590
+ "loss": 0.1709,
591
+ "num_input_tokens_seen": 216268800,
592
+ "step": 6600
593
+ },
594
+ {
595
+ "epoch": 0.8156433082249106,
596
+ "grad_norm": 0.09130859375,
597
+ "learning_rate": 1.7277368511627203e-05,
598
+ "loss": 0.1705,
599
+ "num_input_tokens_seen": 219545600,
600
+ "step": 6700
601
+ },
602
+ {
603
+ "epoch": 0.8278170889446854,
604
+ "grad_norm": 0.07861328125,
605
+ "learning_rate": 1.7149858514250882e-05,
606
+ "loss": 0.1709,
607
+ "num_input_tokens_seen": 222822400,
608
+ "step": 6800
609
+ },
610
+ {
611
+ "epoch": 0.8399908696644601,
612
+ "grad_norm": 0.08837890625,
613
+ "learning_rate": 1.7025130615174974e-05,
614
+ "loss": 0.1739,
615
+ "num_input_tokens_seen": 226099200,
616
+ "step": 6900
617
+ },
618
+ {
619
+ "epoch": 0.852164650384235,
620
+ "grad_norm": 0.08984375,
621
+ "learning_rate": 1.690308509457033e-05,
622
+ "loss": 0.1709,
623
+ "num_input_tokens_seen": 229376000,
624
+ "step": 7000
625
+ },
626
+ {
627
+ "epoch": 0.8643384311040098,
628
+ "grad_norm": 0.083984375,
629
+ "learning_rate": 1.6783627165933782e-05,
630
+ "loss": 0.1733,
631
+ "num_input_tokens_seen": 232652800,
632
+ "step": 7100
633
+ },
634
+ {
635
+ "epoch": 0.8765122118237845,
636
+ "grad_norm": 0.09326171875,
637
+ "learning_rate": 1.6666666666666667e-05,
638
+ "loss": 0.1713,
639
+ "num_input_tokens_seen": 235929600,
640
+ "step": 7200
641
+ },
642
+ {
643
+ "epoch": 0.8765122118237845,
644
+ "eval_BLEU": 34.04522084811147,
645
+ "eval_chrF": 59.676716604524316,
646
+ "eval_loss": 0.14679720997810364,
647
+ "eval_runtime": 4014.0769,
648
+ "eval_samples_per_second": 4.701,
649
+ "eval_steps_per_second": 0.294,
650
+ "num_input_tokens_seen": 235929600,
651
+ "step": 7200
652
+ },
653
+ {
654
+ "epoch": 0.8886859925435593,
655
+ "grad_norm": 0.0869140625,
656
+ "learning_rate": 1.655211777204736e-05,
657
+ "loss": 0.1704,
658
+ "num_input_tokens_seen": 239206400,
659
+ "step": 7300
660
+ },
661
+ {
662
+ "epoch": 0.9008597732633341,
663
+ "grad_norm": 0.08740234375,
664
+ "learning_rate": 1.643989873053573e-05,
665
+ "loss": 0.1705,
666
+ "num_input_tokens_seen": 242483200,
667
+ "step": 7400
668
+ },
669
+ {
670
+ "epoch": 0.9130335539831089,
671
+ "grad_norm": 0.08984375,
672
+ "learning_rate": 1.6329931618554523e-05,
673
+ "loss": 0.1724,
674
+ "num_input_tokens_seen": 245760000,
675
+ "step": 7500
676
+ },
677
+ {
678
+ "epoch": 0.9252073347028836,
679
+ "grad_norm": 0.0908203125,
680
+ "learning_rate": 1.6222142113076256e-05,
681
+ "loss": 0.171,
682
+ "num_input_tokens_seen": 249036800,
683
+ "step": 7600
684
+ },
685
+ {
686
+ "epoch": 0.9373811154226585,
687
+ "grad_norm": 0.08349609375,
688
+ "learning_rate": 1.6116459280507607e-05,
689
+ "loss": 0.1719,
690
+ "num_input_tokens_seen": 252313600,
691
+ "step": 7700
692
+ },
693
+ {
694
+ "epoch": 0.9495548961424333,
695
+ "grad_norm": 0.09375,
696
+ "learning_rate": 1.6012815380508712e-05,
697
+ "loss": 0.1698,
698
+ "num_input_tokens_seen": 255590400,
699
+ "step": 7800
700
+ },
701
+ {
702
+ "epoch": 0.961728676862208,
703
+ "grad_norm": 0.09423828125,
704
+ "learning_rate": 1.59111456835146e-05,
705
+ "loss": 0.173,
706
+ "num_input_tokens_seen": 258867200,
707
+ "step": 7900
708
+ },
709
+ {
710
+ "epoch": 0.9739024575819828,
711
+ "grad_norm": 0.08251953125,
712
+ "learning_rate": 1.5811388300841898e-05,
713
+ "loss": 0.1695,
714
+ "num_input_tokens_seen": 262144000,
715
+ "step": 8000
716
+ },
717
+ {
718
+ "epoch": 0.9860762383017576,
719
+ "grad_norm": 0.09033203125,
720
+ "learning_rate": 1.5713484026367723e-05,
721
+ "loss": 0.1735,
722
+ "num_input_tokens_seen": 265420800,
723
+ "step": 8100
724
+ },
725
+ {
726
+ "epoch": 0.9982500190215323,
727
+ "grad_norm": 0.08935546875,
728
+ "learning_rate": 1.5617376188860607e-05,
729
+ "loss": 0.1714,
730
+ "num_input_tokens_seen": 268697600,
731
+ "step": 8200
732
+ },
733
+ {
734
+ "epoch": 1.0104237997413072,
735
+ "grad_norm": 0.09619140625,
736
+ "learning_rate": 1.5523010514126655e-05,
737
+ "loss": 0.1697,
738
+ "num_input_tokens_seen": 271972864,
739
+ "step": 8300
740
+ },
741
+ {
742
+ "epoch": 1.0225975804610818,
743
+ "grad_norm": 0.09375,
744
+ "learning_rate": 1.543033499620919e-05,
745
+ "loss": 0.171,
746
+ "num_input_tokens_seen": 275249664,
747
+ "step": 8400
748
+ },
749
+ {
750
+ "epoch": 1.0225975804610818,
751
+ "eval_BLEU": 34.158464778823294,
752
+ "eval_chrF": 59.68876956861537,
753
+ "eval_loss": 0.1467299610376358,
754
+ "eval_runtime": 3923.1304,
755
+ "eval_samples_per_second": 4.81,
756
+ "eval_steps_per_second": 0.301,
757
+ "num_input_tokens_seen": 275249664,
758
+ "step": 8400
759
+ },
760
+ {
761
+ "epoch": 1.0347713611808567,
762
+ "grad_norm": 0.0869140625,
763
+ "learning_rate": 1.5339299776947406e-05,
764
+ "loss": 0.1697,
765
+ "num_input_tokens_seen": 278526464,
766
+ "step": 8500
767
+ },
768
+ {
769
+ "epoch": 1.0469451419006315,
770
+ "grad_norm": 0.0888671875,
771
+ "learning_rate": 1.5249857033260467e-05,
772
+ "loss": 0.1679,
773
+ "num_input_tokens_seen": 281803264,
774
+ "step": 8600
775
+ },
776
+ {
777
+ "epoch": 1.0591189226204063,
778
+ "grad_norm": 0.080078125,
779
+ "learning_rate": 1.5161960871578069e-05,
780
+ "loss": 0.1718,
781
+ "num_input_tokens_seen": 285080064,
782
+ "step": 8700
783
+ },
784
+ {
785
+ "epoch": 1.0712927033401811,
786
+ "grad_norm": 0.08056640625,
787
+ "learning_rate": 1.5075567228888182e-05,
788
+ "loss": 0.1701,
789
+ "num_input_tokens_seen": 288356864,
790
+ "step": 8800
791
+ },
792
+ {
793
+ "epoch": 1.083466484059956,
794
+ "grad_norm": 0.0986328125,
795
+ "learning_rate": 1.499063377991723e-05,
796
+ "loss": 0.1711,
797
+ "num_input_tokens_seen": 291633664,
798
+ "step": 8900
799
+ },
800
+ {
801
+ "epoch": 1.0956402647797308,
802
+ "grad_norm": 0.0859375,
803
+ "learning_rate": 1.49071198499986e-05,
804
+ "loss": 0.1693,
805
+ "num_input_tokens_seen": 294910464,
806
+ "step": 9000
807
+ },
808
+ {
809
+ "epoch": 1.1078140454995054,
810
+ "grad_norm": 0.087890625,
811
+ "learning_rate": 1.4824986333222024e-05,
812
+ "loss": 0.17,
813
+ "num_input_tokens_seen": 298187264,
814
+ "step": 9100
815
+ },
816
+ {
817
+ "epoch": 1.1199878262192802,
818
+ "grad_norm": 0.08447265625,
819
+ "learning_rate": 1.4744195615489715e-05,
820
+ "loss": 0.1707,
821
+ "num_input_tokens_seen": 301464064,
822
+ "step": 9200
823
+ },
824
+ {
825
+ "epoch": 1.132161606939055,
826
+ "grad_norm": 0.08154296875,
827
+ "learning_rate": 1.4664711502135331e-05,
828
+ "loss": 0.1698,
829
+ "num_input_tokens_seen": 304740864,
830
+ "step": 9300
831
+ },
832
+ {
833
+ "epoch": 1.1443353876588298,
834
+ "grad_norm": 0.083984375,
835
+ "learning_rate": 1.4586499149789456e-05,
836
+ "loss": 0.1688,
837
+ "num_input_tokens_seen": 308017664,
838
+ "step": 9400
839
+ },
840
+ {
841
+ "epoch": 1.1565091683786046,
842
+ "grad_norm": 0.08837890625,
843
+ "learning_rate": 1.4509525002200233e-05,
844
+ "loss": 0.1674,
845
+ "num_input_tokens_seen": 311294464,
846
+ "step": 9500
847
+ },
848
+ {
849
+ "epoch": 1.1686829490983794,
850
+ "grad_norm": 0.08837890625,
851
+ "learning_rate": 1.4433756729740646e-05,
852
+ "loss": 0.1715,
853
+ "num_input_tokens_seen": 314571264,
854
+ "step": 9600
855
+ },
856
+ {
857
+ "epoch": 1.1686829490983794,
858
+ "eval_BLEU": 34.125660852650576,
859
+ "eval_chrF": 59.69577135277113,
860
+ "eval_loss": 0.14671051502227783,
861
+ "eval_runtime": 3968.6253,
862
+ "eval_samples_per_second": 4.755,
863
+ "eval_steps_per_second": 0.297,
864
+ "num_input_tokens_seen": 314571264,
865
+ "step": 9600
866
+ },
867
+ {
868
+ "epoch": 1.1808567298181543,
869
+ "grad_norm": 0.08642578125,
870
+ "learning_rate": 1.4359163172354764e-05,
871
+ "loss": 0.1693,
872
+ "num_input_tokens_seen": 317848064,
873
+ "step": 9700
874
+ },
875
+ {
876
+ "epoch": 1.1930305105379289,
877
+ "grad_norm": 0.09521484375,
878
+ "learning_rate": 1.4285714285714285e-05,
879
+ "loss": 0.1685,
880
+ "num_input_tokens_seen": 321124864,
881
+ "step": 9800
882
+ },
883
+ {
884
+ "epoch": 1.2052042912577037,
885
+ "grad_norm": 0.08740234375,
886
+ "learning_rate": 1.4213381090374031e-05,
887
+ "loss": 0.1686,
888
+ "num_input_tokens_seen": 324401664,
889
+ "step": 9900
890
+ },
891
+ {
892
+ "epoch": 1.2173780719774785,
893
+ "grad_norm": 0.087890625,
894
+ "learning_rate": 1.4142135623730951e-05,
895
+ "loss": 0.1711,
896
+ "num_input_tokens_seen": 327678464,
897
+ "step": 10000
898
+ },
899
+ {
900
+ "epoch": 1.2295518526972533,
901
+ "grad_norm": 0.08447265625,
902
+ "learning_rate": 1.4071950894605838e-05,
903
+ "loss": 0.1707,
904
+ "num_input_tokens_seen": 330955264,
905
+ "step": 10100
906
+ },
907
+ {
908
+ "epoch": 1.2417256334170281,
909
+ "grad_norm": 0.08740234375,
910
+ "learning_rate": 1.4002800840280098e-05,
911
+ "loss": 0.1704,
912
+ "num_input_tokens_seen": 334232064,
913
+ "step": 10200
914
+ },
915
+ {
916
+ "epoch": 1.2538994141368027,
917
+ "grad_norm": 0.0830078125,
918
+ "learning_rate": 1.3934660285832355e-05,
919
+ "loss": 0.1666,
920
+ "num_input_tokens_seen": 337508864,
921
+ "step": 10300
922
+ },
923
+ {
924
+ "epoch": 1.2660731948565775,
925
+ "grad_norm": 0.08544921875,
926
+ "learning_rate": 1.3867504905630729e-05,
927
+ "loss": 0.1688,
928
+ "num_input_tokens_seen": 340785664,
929
+ "step": 10400
930
+ },
931
+ {
932
+ "epoch": 1.2782469755763524,
933
+ "grad_norm": 0.09619140625,
934
+ "learning_rate": 1.3801311186847085e-05,
935
+ "loss": 0.1706,
936
+ "num_input_tokens_seen": 344062464,
937
+ "step": 10500
938
+ },
939
+ {
940
+ "epoch": 1.2904207562961272,
941
+ "grad_norm": 0.0859375,
942
+ "learning_rate": 1.3736056394868905e-05,
943
+ "loss": 0.1699,
944
+ "num_input_tokens_seen": 347339264,
945
+ "step": 10600
946
+ },
947
+ {
948
+ "epoch": 1.302594537015902,
949
+ "grad_norm": 0.0888671875,
950
+ "learning_rate": 1.3671718540493266e-05,
951
+ "loss": 0.1686,
952
+ "num_input_tokens_seen": 350616064,
953
+ "step": 10700
954
+ },
955
+ {
956
+ "epoch": 1.3147683177356768,
957
+ "grad_norm": 0.083984375,
958
+ "learning_rate": 1.3608276348795434e-05,
959
+ "loss": 0.1705,
960
+ "num_input_tokens_seen": 353892864,
961
+ "step": 10800
962
+ },
963
+ {
964
+ "epoch": 1.3147683177356768,
965
+ "eval_BLEU": 34.190278283000254,
966
+ "eval_chrF": 59.72105944050832,
967
+ "eval_loss": 0.14669395983219147,
968
+ "eval_runtime": 3965.5317,
969
+ "eval_samples_per_second": 4.759,
970
+ "eval_steps_per_second": 0.298,
971
+ "num_input_tokens_seen": 353892864,
972
+ "step": 10800
973
+ },
974
+ {
975
+ "epoch": 1.3269420984554516,
976
+ "grad_norm": 0.08251953125,
977
+ "learning_rate": 1.3545709229571929e-05,
978
+ "loss": 0.1702,
979
+ "num_input_tokens_seen": 357169664,
980
+ "step": 10900
981
+ },
982
+ {
983
+ "epoch": 1.3391158791752265,
984
+ "grad_norm": 0.08544921875,
985
+ "learning_rate": 1.3483997249264842e-05,
986
+ "loss": 0.1693,
987
+ "num_input_tokens_seen": 360446464,
988
+ "step": 11000
989
+ },
990
+ {
991
+ "epoch": 1.3512896598950013,
992
+ "grad_norm": 0.08544921875,
993
+ "learning_rate": 1.3423121104280487e-05,
994
+ "loss": 0.1711,
995
+ "num_input_tokens_seen": 363723264,
996
+ "step": 11100
997
+ },
998
+ {
999
+ "epoch": 1.3634634406147759,
1000
+ "grad_norm": 0.0849609375,
1001
+ "learning_rate": 1.3363062095621221e-05,
1002
+ "loss": 0.1687,
1003
+ "num_input_tokens_seen": 367000064,
1004
+ "step": 11200
1005
+ },
1006
+ {
1007
+ "epoch": 1.3756372213345507,
1008
+ "grad_norm": 0.09033203125,
1009
+ "learning_rate": 1.3303802104754787e-05,
1010
+ "loss": 0.1701,
1011
+ "num_input_tokens_seen": 370276864,
1012
+ "step": 11300
1013
+ },
1014
+ {
1015
+ "epoch": 1.3878110020543255,
1016
+ "grad_norm": 0.10107421875,
1017
+ "learning_rate": 1.324532357065044e-05,
1018
+ "loss": 0.1706,
1019
+ "num_input_tokens_seen": 373553664,
1020
+ "step": 11400
1021
+ },
1022
+ {
1023
+ "epoch": 1.3999847827741003,
1024
+ "grad_norm": 0.08544921875,
1025
+ "learning_rate": 1.318760946791574e-05,
1026
+ "loss": 0.1691,
1027
+ "num_input_tokens_seen": 376830464,
1028
+ "step": 11500
1029
+ },
1030
+ {
1031
+ "epoch": 1.4121585634938751,
1032
+ "grad_norm": 0.09130859375,
1033
+ "learning_rate": 1.3130643285972255e-05,
1034
+ "loss": 0.1699,
1035
+ "num_input_tokens_seen": 380107264,
1036
+ "step": 11600
1037
+ },
1038
+ {
1039
+ "epoch": 1.4243323442136497,
1040
+ "grad_norm": 0.095703125,
1041
+ "learning_rate": 1.3074409009212269e-05,
1042
+ "loss": 0.1698,
1043
+ "num_input_tokens_seen": 383384064,
1044
+ "step": 11700
1045
+ },
1046
+ {
1047
+ "epoch": 1.4365061249334246,
1048
+ "grad_norm": 0.0849609375,
1049
+ "learning_rate": 1.3018891098082389e-05,
1050
+ "loss": 0.1692,
1051
+ "num_input_tokens_seen": 386660864,
1052
+ "step": 11800
1053
+ },
1054
+ {
1055
+ "epoch": 1.4486799056531994,
1056
+ "grad_norm": 0.08642578125,
1057
+ "learning_rate": 1.2964074471043288e-05,
1058
+ "loss": 0.1691,
1059
+ "num_input_tokens_seen": 389937664,
1060
+ "step": 11900
1061
+ },
1062
+ {
1063
+ "epoch": 1.4608536863729742,
1064
+ "grad_norm": 0.0849609375,
1065
+ "learning_rate": 1.2909944487358055e-05,
1066
+ "loss": 0.1709,
1067
+ "num_input_tokens_seen": 393214464,
1068
+ "step": 12000
1069
+ },
1070
+ {
1071
+ "epoch": 1.4608536863729742,
1072
+ "eval_BLEU": 34.19371467731715,
1073
+ "eval_chrF": 59.7146095038311,
1074
+ "eval_loss": 0.14666913449764252,
1075
+ "eval_runtime": 3982.0054,
1076
+ "eval_samples_per_second": 4.739,
1077
+ "eval_steps_per_second": 0.296,
1078
+ "num_input_tokens_seen": 393214464,
1079
+ "step": 12000
1080
+ },
1081
+ {
1082
+ "epoch": 1.473027467092749,
1083
+ "grad_norm": 0.08837890625,
1084
+ "learning_rate": 1.2856486930664503e-05,
1085
+ "loss": 0.1719,
1086
+ "num_input_tokens_seen": 396491264,
1087
+ "step": 12100
1088
+ },
1089
+ {
1090
+ "epoch": 1.4852012478125238,
1091
+ "grad_norm": 0.08447265625,
1092
+ "learning_rate": 1.2803687993289598e-05,
1093
+ "loss": 0.1677,
1094
+ "num_input_tokens_seen": 399768064,
1095
+ "step": 12200
1096
+ },
1097
+ {
1098
+ "epoch": 1.4973750285322986,
1099
+ "grad_norm": 0.0830078125,
1100
+ "learning_rate": 1.2751534261266765e-05,
1101
+ "loss": 0.1696,
1102
+ "num_input_tokens_seen": 403044864,
1103
+ "step": 12300
1104
+ },
1105
+ {
1106
+ "epoch": 1.5095488092520735,
1107
+ "grad_norm": 0.08447265625,
1108
+ "learning_rate": 1.2700012700019051e-05,
1109
+ "loss": 0.1692,
1110
+ "num_input_tokens_seen": 406321664,
1111
+ "step": 12400
1112
+ },
1113
+ {
1114
+ "epoch": 1.5217225899718483,
1115
+ "grad_norm": 0.08740234375,
1116
+ "learning_rate": 1.2649110640673517e-05,
1117
+ "loss": 0.1718,
1118
+ "num_input_tokens_seen": 409598464,
1119
+ "step": 12500
1120
+ },
1121
+ {
1122
+ "epoch": 1.5338963706916229,
1123
+ "grad_norm": 0.08544921875,
1124
+ "learning_rate": 1.2598815766974239e-05,
1125
+ "loss": 0.1707,
1126
+ "num_input_tokens_seen": 412875264,
1127
+ "step": 12600
1128
+ },
1129
+ {
1130
+ "epoch": 1.5460701514113977,
1131
+ "grad_norm": 0.09521484375,
1132
+ "learning_rate": 1.2549116102763172e-05,
1133
+ "loss": 0.1678,
1134
+ "num_input_tokens_seen": 416152064,
1135
+ "step": 12700
1136
+ },
1137
+ {
1138
+ "epoch": 1.5582439321311725,
1139
+ "grad_norm": 0.07958984375,
1140
+ "learning_rate": 1.25e-05,
1141
+ "loss": 0.1668,
1142
+ "num_input_tokens_seen": 419428864,
1143
+ "step": 12800
1144
+ },
1145
+ {
1146
+ "epoch": 1.5704177128509471,
1147
+ "grad_norm": 0.08740234375,
1148
+ "learning_rate": 1.2451456127293808e-05,
1149
+ "loss": 0.169,
1150
+ "num_input_tokens_seen": 422705664,
1151
+ "step": 12900
1152
+ },
1153
+ {
1154
+ "epoch": 1.582591493570722,
1155
+ "grad_norm": 0.08642578125,
1156
+ "learning_rate": 1.2403473458920847e-05,
1157
+ "loss": 0.1677,
1158
+ "num_input_tokens_seen": 425982464,
1159
+ "step": 13000
1160
+ },
1161
+ {
1162
+ "epoch": 1.5947652742904967,
1163
+ "grad_norm": 0.0830078125,
1164
+ "learning_rate": 1.2356041264304309e-05,
1165
+ "loss": 0.168,
1166
+ "num_input_tokens_seen": 429259264,
1167
+ "step": 13100
1168
+ },
1169
+ {
1170
+ "epoch": 1.6069390550102716,
1171
+ "grad_norm": 0.09765625,
1172
+ "learning_rate": 1.2309149097933273e-05,
1173
+ "loss": 0.1699,
1174
+ "num_input_tokens_seen": 432536064,
1175
+ "step": 13200
1176
+ },
1177
+ {
1178
+ "epoch": 1.6069390550102716,
1179
+ "eval_BLEU": 34.185916767080265,
1180
+ "eval_chrF": 59.706613240927844,
1181
+ "eval_loss": 0.14664307236671448,
1182
+ "eval_runtime": 3971.3932,
1183
+ "eval_samples_per_second": 4.751,
1184
+ "eval_steps_per_second": 0.297,
1185
+ "num_input_tokens_seen": 432536064,
1186
+ "step": 13200
1187
+ },
1188
+ {
1189
+ "epoch": 1.6191128357300464,
1190
+ "grad_norm": 0.08447265625,
1191
+ "learning_rate": 1.2262786789699317e-05,
1192
+ "loss": 0.1686,
1193
+ "num_input_tokens_seen": 435812864,
1194
+ "step": 13300
1195
+ },
1196
+ {
1197
+ "epoch": 1.6312866164498212,
1198
+ "grad_norm": 0.09326171875,
1199
+ "learning_rate": 1.2216944435630522e-05,
1200
+ "loss": 0.1662,
1201
+ "num_input_tokens_seen": 439089664,
1202
+ "step": 13400
1203
+ },
1204
+ {
1205
+ "epoch": 1.643460397169596,
1206
+ "grad_norm": 0.091796875,
1207
+ "learning_rate": 1.2171612389003693e-05,
1208
+ "loss": 0.1704,
1209
+ "num_input_tokens_seen": 442366464,
1210
+ "step": 13500
1211
+ },
1212
+ {
1213
+ "epoch": 1.6556341778893708,
1214
+ "grad_norm": 0.09716796875,
1215
+ "learning_rate": 1.212678125181665e-05,
1216
+ "loss": 0.1717,
1217
+ "num_input_tokens_seen": 445643264,
1218
+ "step": 13600
1219
+ },
1220
+ {
1221
+ "epoch": 1.6678079586091457,
1222
+ "grad_norm": 0.09619140625,
1223
+ "learning_rate": 1.208244186660354e-05,
1224
+ "loss": 0.17,
1225
+ "num_input_tokens_seen": 448920064,
1226
+ "step": 13700
1227
+ },
1228
+ {
1229
+ "epoch": 1.6799817393289205,
1230
+ "grad_norm": 0.07958984375,
1231
+ "learning_rate": 1.203858530857692e-05,
1232
+ "loss": 0.1699,
1233
+ "num_input_tokens_seen": 452196864,
1234
+ "step": 13800
1235
+ },
1236
+ {
1237
+ "epoch": 1.692155520048695,
1238
+ "grad_norm": 0.09619140625,
1239
+ "learning_rate": 1.1995202878081345e-05,
1240
+ "loss": 0.1703,
1241
+ "num_input_tokens_seen": 455473664,
1242
+ "step": 13900
1243
+ },
1244
+ {
1245
+ "epoch": 1.70432930076847,
1246
+ "grad_norm": 0.0908203125,
1247
+ "learning_rate": 1.1952286093343936e-05,
1248
+ "loss": 0.173,
1249
+ "num_input_tokens_seen": 458750464,
1250
+ "step": 14000
1251
+ },
1252
+ {
1253
+ "epoch": 1.7165030814882447,
1254
+ "grad_norm": 0.0908203125,
1255
+ "learning_rate": 1.1909826683508273e-05,
1256
+ "loss": 0.1694,
1257
+ "num_input_tokens_seen": 462027264,
1258
+ "step": 14100
1259
+ },
1260
+ {
1261
+ "epoch": 1.7286768622080195,
1262
+ "grad_norm": 0.080078125,
1263
+ "learning_rate": 1.1867816581938534e-05,
1264
+ "loss": 0.1698,
1265
+ "num_input_tokens_seen": 465304064,
1266
+ "step": 14200
1267
+ },
1268
+ {
1269
+ "epoch": 1.7408506429277941,
1270
+ "grad_norm": 0.08251953125,
1271
+ "learning_rate": 1.1826247919781652e-05,
1272
+ "loss": 0.1706,
1273
+ "num_input_tokens_seen": 468580864,
1274
+ "step": 14300
1275
+ },
1276
+ {
1277
+ "epoch": 1.753024423647569,
1278
+ "grad_norm": 0.08544921875,
1279
+ "learning_rate": 1.1785113019775794e-05,
1280
+ "loss": 0.1699,
1281
+ "num_input_tokens_seen": 471857664,
1282
+ "step": 14400
1283
+ },
1284
+ {
1285
+ "epoch": 1.753024423647569,
1286
+ "eval_BLEU": 34.102099613409436,
1287
+ "eval_chrF": 59.716548967741204,
1288
+ "eval_loss": 0.1466248631477356,
1289
+ "eval_runtime": 3984.4468,
1290
+ "eval_samples_per_second": 4.736,
1291
+ "eval_steps_per_second": 0.296,
1292
+ "num_input_tokens_seen": 471857664,
1293
+ "step": 14400
1294
+ },
1295
+ {
1296
+ "epoch": 1.7651982043673438,
1297
+ "grad_norm": 0.0859375,
1298
+ "learning_rate": 1.174440439029407e-05,
1299
+ "loss": 0.1677,
1300
+ "num_input_tokens_seen": 475134464,
1301
+ "step": 14500
1302
+ },
1303
+ {
1304
+ "epoch": 1.7773719850871186,
1305
+ "grad_norm": 0.07958984375,
1306
+ "learning_rate": 1.1704114719613058e-05,
1307
+ "loss": 0.1724,
1308
+ "num_input_tokens_seen": 478411264,
1309
+ "step": 14600
1310
+ },
1311
+ {
1312
+ "epoch": 1.7895457658068934,
1313
+ "grad_norm": 0.0947265625,
1314
+ "learning_rate": 1.1664236870396087e-05,
1315
+ "loss": 0.1684,
1316
+ "num_input_tokens_seen": 481688064,
1317
+ "step": 14700
1318
+ },
1319
+ {
1320
+ "epoch": 1.8017195465266682,
1321
+ "grad_norm": 0.08740234375,
1322
+ "learning_rate": 1.162476387438193e-05,
1323
+ "loss": 0.1689,
1324
+ "num_input_tokens_seen": 484964864,
1325
+ "step": 14800
1326
+ },
1327
+ {
1328
+ "epoch": 1.813893327246443,
1329
+ "grad_norm": 0.09033203125,
1330
+ "learning_rate": 1.1585688927269846e-05,
1331
+ "loss": 0.1681,
1332
+ "num_input_tokens_seen": 488241664,
1333
+ "step": 14900
1334
+ },
1335
+ {
1336
+ "epoch": 1.8260671079662179,
1337
+ "grad_norm": 0.0830078125,
1338
+ "learning_rate": 1.1547005383792514e-05,
1339
+ "loss": 0.1682,
1340
+ "num_input_tokens_seen": 491518464,
1341
+ "step": 15000
1342
+ },
1343
+ {
1344
+ "epoch": 1.8382408886859927,
1345
+ "grad_norm": 0.09423828125,
1346
+ "learning_rate": 1.150870675296872e-05,
1347
+ "loss": 0.1725,
1348
+ "num_input_tokens_seen": 494795264,
1349
+ "step": 15100
1350
+ },
1351
+ {
1352
+ "epoch": 1.8504146694057675,
1353
+ "grad_norm": 0.08203125,
1354
+ "learning_rate": 1.1470786693528087e-05,
1355
+ "loss": 0.167,
1356
+ "num_input_tokens_seen": 498072064,
1357
+ "step": 15200
1358
+ },
1359
+ {
1360
+ "epoch": 1.862588450125542,
1361
+ "grad_norm": 0.09130859375,
1362
+ "learning_rate": 1.143323900950059e-05,
1363
+ "loss": 0.1695,
1364
+ "num_input_tokens_seen": 501348864,
1365
+ "step": 15300
1366
+ },
1367
+ {
1368
+ "epoch": 1.874762230845317,
1369
+ "grad_norm": 0.08349609375,
1370
+ "learning_rate": 1.1396057645963795e-05,
1371
+ "loss": 0.1696,
1372
+ "num_input_tokens_seen": 504625664,
1373
+ "step": 15400
1374
+ },
1375
+ {
1376
+ "epoch": 1.8869360115650917,
1377
+ "grad_norm": 0.0859375,
1378
+ "learning_rate": 1.1359236684941297e-05,
1379
+ "loss": 0.1679,
1380
+ "num_input_tokens_seen": 507902464,
1381
+ "step": 15500
1382
+ },
1383
+ {
1384
+ "epoch": 1.8991097922848663,
1385
+ "grad_norm": 0.0869140625,
1386
+ "learning_rate": 1.1322770341445956e-05,
1387
+ "loss": 0.17,
1388
+ "num_input_tokens_seen": 511179264,
1389
+ "step": 15600
1390
+ },
1391
+ {
1392
+ "epoch": 1.8991097922848663,
1393
+ "eval_BLEU": 34.13689431537213,
1394
+ "eval_chrF": 59.7127572353836,
1395
+ "eval_loss": 0.14662402868270874,
1396
+ "eval_runtime": 3972.3482,
1397
+ "eval_samples_per_second": 4.75,
1398
+ "eval_steps_per_second": 0.297,
1399
+ "num_input_tokens_seen": 511179264,
1400
+ "step": 15600
1401
+ },
1402
+ {
1403
+ "epoch": 1.9112835730046411,
1404
+ "grad_norm": 0.09521484375,
1405
+ "learning_rate": 1.1286652959662007e-05,
1406
+ "loss": 0.1685,
1407
+ "num_input_tokens_seen": 514456064,
1408
+ "step": 15700
1409
+ },
1410
+ {
1411
+ "epoch": 1.923457353724416,
1412
+ "grad_norm": 0.08740234375,
1413
+ "learning_rate": 1.125087900926024e-05,
1414
+ "loss": 0.1713,
1415
+ "num_input_tokens_seen": 517732864,
1416
+ "step": 15800
1417
+ },
1418
+ {
1419
+ "epoch": 1.9356311344441908,
1420
+ "grad_norm": 0.08544921875,
1421
+ "learning_rate": 1.1215443081840888e-05,
1422
+ "loss": 0.169,
1423
+ "num_input_tokens_seen": 521009664,
1424
+ "step": 15900
1425
+ },
1426
+ {
1427
+ "epoch": 1.9478049151639656,
1428
+ "grad_norm": 0.0908203125,
1429
+ "learning_rate": 1.1180339887498949e-05,
1430
+ "loss": 0.1717,
1431
+ "num_input_tokens_seen": 524286464,
1432
+ "step": 16000
1433
+ },
1434
+ {
1435
+ "epoch": 1.9599786958837404,
1436
+ "grad_norm": 0.09130859375,
1437
+ "learning_rate": 1.1145564251507057e-05,
1438
+ "loss": 0.1707,
1439
+ "num_input_tokens_seen": 527563264,
1440
+ "step": 16100
1441
+ },
1442
+ {
1443
+ "epoch": 1.9721524766035152,
1444
+ "grad_norm": 0.08642578125,
1445
+ "learning_rate": 1.1111111111111112e-05,
1446
+ "loss": 0.1659,
1447
+ "num_input_tokens_seen": 530840064,
1448
+ "step": 16200
1449
+ },
1450
+ {
1451
+ "epoch": 1.98432625732329,
1452
+ "grad_norm": 0.09619140625,
1453
+ "learning_rate": 1.1076975512434226e-05,
1454
+ "loss": 0.168,
1455
+ "num_input_tokens_seen": 534116864,
1456
+ "step": 16300
1457
+ },
1458
+ {
1459
+ "epoch": 1.9965000380430649,
1460
+ "grad_norm": 0.08447265625,
1461
+ "learning_rate": 1.1043152607484654e-05,
1462
+ "loss": 0.1684,
1463
+ "num_input_tokens_seen": 537393664,
1464
+ "step": 16400
1465
+ }
1466
+ ],
1467
+ "logging_steps": 100,
1468
+ "max_steps": 16428,
1469
+ "num_input_tokens_seen": 538311168,
1470
+ "num_train_epochs": 2,
1471
+ "save_steps": 1200,
1472
+ "stateful_callbacks": {
1473
+ "EarlyStoppingCallback": {
1474
+ "args": {
1475
+ "early_stopping_patience": 5,
1476
+ "early_stopping_threshold": 0.001
1477
+ },
1478
+ "attributes": {
1479
+ "early_stopping_patience_counter": 0
1480
+ }
1481
+ },
1482
+ "TrainerControl": {
1483
+ "args": {
1484
+ "should_epoch_stop": false,
1485
+ "should_evaluate": false,
1486
+ "should_log": false,
1487
+ "should_save": true,
1488
+ "should_training_stop": true
1489
+ },
1490
+ "attributes": {}
1491
+ }
1492
+ },
1493
+ "total_flos": 3.1036059771249623e+18,
1494
+ "train_batch_size": 16,
1495
+ "trial_name": null,
1496
+ "trial_params": null
1497
+ }
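
trainer_state.json above follows the standard transformers Trainer layout: top-level metadata (best_metric, best_model_checkpoint, global_step, early-stopping state) plus a log_history array that interleaves training logs (loss, learning_rate, grad_norm every 100 steps) with evaluation entries (eval_BLEU, eval_chrF, eval_loss every 1200 steps). A minimal sketch, assuming a local copy of the file, that tabulates the eval checkpoints and the reported best one (checkpoint-12000, BLEU 34.19):

    # Sketch: summarize the eval checkpoints recorded in trainer_state.json.
    import json

    with open("trainer_state.json") as f:   # assumed local copy of the file above
        state = json.load(f)

    print(f"best: {state['best_model_checkpoint']} (BLEU {state['best_metric']:.2f})")
    print(f"{'step':>6}  {'BLEU':>6}  {'chrF':>6}  {'eval_loss':>9}")
    for entry in state["log_history"]:
        if "eval_BLEU" in entry:             # eval entries; train-loss entries lack this key
            print(f"{entry['step']:>6}  {entry['eval_BLEU']:>6.2f}  "
                  f"{entry['eval_chrF']:>6.2f}  {entry['eval_loss']:>9.4f}")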