alexue4 committed on
Commit
407d7de
1 Parent(s): 1ce4365

End of training

Browse files
Files changed (4) hide show
  1. README.md +65 -35
  2. pytorch_model.bin +1 -1
  3. trainer_state.json +1368 -1068
  4. training_args.bin +1 -1
README.md CHANGED
@@ -15,9 +15,9 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model is a fine-tuned version of [cointegrated/rut5-small](https://huggingface.co/cointegrated/rut5-small) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.0985
19
  - Mean Distance: 0
20
- - Max Distance: 9
21
 
22
  ## Model description
23
 
@@ -43,42 +43,72 @@ The following hyperparameters were used during training:
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
- - num_epochs: 30
47
 
48
  ### Training results
49
 
50
- | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
- |:-------------:|:-----:|:-----:|:---------------:|:-------------:|:------------:|
52
- | 0.181 | 1.0 | 2598 | 0.2140 | 4 | 36 |
53
- | 0.1067 | 2.0 | 5196 | 0.1416 | 2 | 29 |
54
- | 0.0801 | 3.0 | 7794 | 0.1098 | 2 | 22 |
55
- | 0.0575 | 4.0 | 10392 | 0.1081 | 2 | 18 |
56
- | 0.0452 | 5.0 | 12990 | 0.0897 | 1 | 14 |
57
- | 0.0372 | 6.0 | 15588 | 0.0720 | 1 | 15 |
58
- | 0.0323 | 7.0 | 18186 | 0.0840 | 1 | 12 |
59
- | 0.0267 | 8.0 | 20784 | 0.0768 | 1 | 16 |
60
- | 0.0231 | 9.0 | 23382 | 0.0697 | 1 | 10 |
61
- | 0.0199 | 10.0 | 25980 | 0.0717 | 1 | 9 |
62
- | 0.0168 | 11.0 | 28578 | 0.0812 | 1 | 16 |
63
- | 0.0148 | 12.0 | 31176 | 0.0961 | 1 | 12 |
64
- | 0.0128 | 13.0 | 33774 | 0.0823 | 1 | 9 |
65
- | 0.0112 | 14.0 | 36372 | 0.0766 | 1 | 12 |
66
- | 0.0093 | 15.0 | 38970 | 0.0713 | 1 | 9 |
67
- | 0.0083 | 16.0 | 41568 | 0.0847 | 1 | 14 |
68
- | 0.0076 | 17.0 | 44166 | 0.0863 | 1 | 11 |
69
- | 0.0064 | 18.0 | 46764 | 0.0830 | 1 | 14 |
70
- | 0.0054 | 19.0 | 49362 | 0.0884 | 1 | 11 |
71
- | 0.0052 | 20.0 | 51960 | 0.0821 | 1 | 10 |
72
- | 0.0045 | 21.0 | 54558 | 0.0915 | 1 | 14 |
73
- | 0.0037 | 22.0 | 57156 | 0.0931 | 1 | 14 |
74
- | 0.0036 | 23.0 | 59754 | 0.0941 | 1 | 9 |
75
- | 0.0028 | 24.0 | 62352 | 0.0861 | 1 | 13 |
76
- | 0.0026 | 25.0 | 64950 | 0.0912 | 1 | 12 |
77
- | 0.0024 | 26.0 | 67548 | 0.0916 | 0 | 9 |
78
- | 0.002 | 27.0 | 70146 | 0.0888 | 0 | 9 |
79
- | 0.0017 | 28.0 | 72744 | 0.0888 | 0 | 9 |
80
- | 0.0017 | 29.0 | 75342 | 0.0952 | 0 | 9 |
81
- | 0.0014 | 30.0 | 77940 | 0.0985 | 0 | 9 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
 
84
  ### Framework versions
 
15
 
16
  This model is a fine-tuned version of [cointegrated/rut5-small](https://huggingface.co/cointegrated/rut5-small) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0177
19
  - Mean Distance: 0
20
+ - Max Distance: 15
21
 
22
  ## Model description
23
 
 
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
+ - num_epochs: 60
47
 
48
  ### Training results
49
 
50
+ | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
+ |:-------------:|:-----:|:------:|:---------------:|:-------------:|:------------:|
52
+ | 0.2236 | 1.0 | 3298 | 0.1120 | 5 | 133 |
53
+ | 0.1179 | 2.0 | 6596 | 0.0548 | 3 | 82 |
54
+ | 0.0829 | 3.0 | 9894 | 0.0425 | 1 | 46 |
55
+ | 0.0643 | 4.0 | 13192 | 0.0311 | 1 | 64 |
56
+ | 0.0538 | 5.0 | 16490 | 0.0267 | 1 | 48 |
57
+ | 0.0469 | 6.0 | 19788 | 0.0396 | 2 | 80 |
58
+ | 0.0385 | 7.0 | 23086 | 0.0262 | 2 | 73 |
59
+ | 0.0316 | 8.0 | 26384 | 0.0223 | 1 | 40 |
60
+ | 0.0263 | 9.0 | 29682 | 0.0240 | 1 | 69 |
61
+ | 0.0226 | 10.0 | 32980 | 0.0203 | 1 | 60 |
62
+ | 0.0203 | 11.0 | 36278 | 0.0177 | 1 | 54 |
63
+ | 0.0178 | 12.0 | 39576 | 0.0188 | 1 | 61 |
64
+ | 0.0154 | 13.0 | 42874 | 0.0296 | 1 | 65 |
65
+ | 0.0138 | 14.0 | 46172 | 0.0201 | 1 | 55 |
66
+ | 0.012 | 15.0 | 49470 | 0.0268 | 1 | 67 |
67
+ | 0.0109 | 16.0 | 52768 | 0.0163 | 1 | 35 |
68
+ | 0.0105 | 17.0 | 56066 | 0.0136 | 1 | 26 |
69
+ | 0.0092 | 18.0 | 59364 | 0.0202 | 1 | 65 |
70
+ | 0.0087 | 19.0 | 62662 | 0.0221 | 1 | 65 |
71
+ | 0.0075 | 20.0 | 65960 | 0.0203 | 1 | 33 |
72
+ | 0.0067 | 21.0 | 69258 | 0.0226 | 1 | 26 |
73
+ | 0.0062 | 22.0 | 72556 | 0.0184 | 1 | 24 |
74
+ | 0.0059 | 23.0 | 75854 | 0.0131 | 0 | 18 |
75
+ | 0.0054 | 24.0 | 79152 | 0.0270 | 1 | 58 |
76
+ | 0.0052 | 25.0 | 82450 | 0.0244 | 1 | 45 |
77
+ | 0.0044 | 26.0 | 85748 | 0.0149 | 1 | 23 |
78
+ | 0.0043 | 27.0 | 89046 | 0.0256 | 1 | 63 |
79
+ | 0.0038 | 28.0 | 92344 | 0.0172 | 1 | 30 |
80
+ | 0.0036 | 29.0 | 95642 | 0.0224 | 1 | 37 |
81
+ | 0.0033 | 30.0 | 98940 | 0.0194 | 1 | 30 |
82
+ | 0.0031 | 31.0 | 102238 | 0.0238 | 1 | 59 |
83
+ | 0.003 | 32.0 | 105536 | 0.0200 | 1 | 28 |
84
+ | 0.0028 | 33.0 | 108834 | 0.0161 | 0 | 18 |
85
+ | 0.0027 | 34.0 | 112132 | 0.0215 | 1 | 26 |
86
+ | 0.0025 | 35.0 | 115430 | 0.0198 | 0 | 19 |
87
+ | 0.0023 | 36.0 | 118728 | 0.0168 | 0 | 24 |
88
+ | 0.002 | 37.0 | 122026 | 0.0221 | 1 | 32 |
89
+ | 0.0019 | 38.0 | 125324 | 0.0214 | 1 | 32 |
90
+ | 0.0017 | 39.0 | 128622 | 0.0186 | 0 | 19 |
91
+ | 0.0017 | 40.0 | 131920 | 0.0171 | 0 | 23 |
92
+ | 0.0016 | 41.0 | 135218 | 0.0164 | 0 | 17 |
93
+ | 0.0015 | 42.0 | 138516 | 0.0166 | 1 | 21 |
94
+ | 0.0014 | 43.0 | 141814 | 0.0167 | 0 | 21 |
95
+ | 0.0019 | 44.0 | 145112 | 0.0192 | 1 | 32 |
96
+ | 0.0011 | 45.0 | 148410 | 0.0209 | 1 | 27 |
97
+ | 0.0011 | 46.0 | 151708 | 0.0218 | 0 | 23 |
98
+ | 0.001 | 47.0 | 155006 | 0.0195 | 0 | 25 |
99
+ | 0.0009 | 48.0 | 158304 | 0.0166 | 0 | 15 |
100
+ | 0.0008 | 49.0 | 161602 | 0.0210 | 1 | 31 |
101
+ | 0.0008 | 50.0 | 164900 | 0.0230 | 0 | 22 |
102
+ | 0.0008 | 51.0 | 168198 | 0.0184 | 0 | 15 |
103
+ | 0.0007 | 52.0 | 171496 | 0.0183 | 0 | 15 |
104
+ | 0.0006 | 53.0 | 174794 | 0.0234 | 1 | 32 |
105
+ | 0.0005 | 54.0 | 178092 | 0.0227 | 0 | 24 |
106
+ | 0.0004 | 55.0 | 181390 | 0.0188 | 0 | 15 |
107
+ | 0.0005 | 56.0 | 184688 | 0.0191 | 0 | 15 |
108
+ | 0.0004 | 57.0 | 187986 | 0.0183 | 0 | 15 |
109
+ | 0.0003 | 58.0 | 191284 | 0.0180 | 0 | 15 |
110
+ | 0.0003 | 59.0 | 194582 | 0.0180 | 0 | 15 |
111
+ | 0.0004 | 60.0 | 197880 | 0.0177 | 0 | 15 |
112
 
113
 
114
  ### Framework versions
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18151e9614a6f5186bbe2400883a83025ffebc2e1a09f7899f724917c104504e
3
  size 258643461
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67c7f076dd09ccc14ee16c69cabf6f1ca5b674bd9bd1bf502d509b46230e8f17
3
  size 258643461
trainer_state.json CHANGED
@@ -1,1528 +1,1828 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 30.0,
5
  "eval_steps": 500,
6
- "global_step": 77940,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.2830382345393893e-07,
14
- "loss": 13.6904,
15
  "step": 1
16
  },
17
- {
18
- "epoch": 0.15,
19
- "learning_rate": 5.003849114703618e-05,
20
- "loss": 5.273,
21
- "step": 390
22
- },
23
  {
24
  "epoch": 0.3,
25
- "learning_rate": 0.00010007698229407236,
26
- "loss": 0.4624,
27
- "step": 780
28
- },
29
- {
30
- "epoch": 0.45,
31
- "learning_rate": 0.00015011547344110854,
32
- "loss": 0.3258,
33
- "step": 1170
34
  },
35
  {
36
  "epoch": 0.6,
37
- "learning_rate": 0.00020015396458814472,
38
- "loss": 0.2553,
39
- "step": 1560
40
- },
41
- {
42
- "epoch": 0.75,
43
- "learning_rate": 0.0002501924557351809,
44
- "loss": 0.2143,
45
- "step": 1950
46
  },
47
  {
48
  "epoch": 0.9,
49
- "learning_rate": 0.0003002309468822171,
50
- "loss": 0.181,
51
- "step": 2340
52
  },
53
  {
54
  "epoch": 1.0,
55
- "eval_loss": 0.21401263773441315,
56
- "eval_max_distance": 36,
57
- "eval_mean_distance": 4,
58
- "eval_runtime": 0.4225,
59
- "eval_samples_per_second": 118.338,
60
- "eval_steps_per_second": 4.734,
61
- "step": 2598
62
- },
63
- {
64
- "epoch": 1.05,
65
- "learning_rate": 0.0003502694380292533,
66
- "loss": 0.1557,
67
- "step": 2730
68
  },
69
  {
70
  "epoch": 1.2,
71
- "learning_rate": 0.00040030792917628943,
72
- "loss": 0.1402,
73
- "step": 3120
74
- },
75
- {
76
- "epoch": 1.35,
77
- "learning_rate": 0.00045034642032332564,
78
- "loss": 0.1283,
79
- "step": 3510
80
  },
81
  {
82
  "epoch": 1.5,
83
- "learning_rate": 0.0005003849114703618,
84
- "loss": 0.1194,
85
- "step": 3900
86
- },
87
- {
88
- "epoch": 1.65,
89
- "learning_rate": 0.0005504234026173979,
90
- "loss": 0.1158,
91
- "step": 4290
92
  },
93
  {
94
  "epoch": 1.8,
95
- "learning_rate": 0.0006004618937644341,
96
- "loss": 0.108,
97
- "step": 4680
98
- },
99
- {
100
- "epoch": 1.95,
101
- "learning_rate": 0.0006505003849114704,
102
- "loss": 0.1067,
103
- "step": 5070
104
  },
105
  {
106
  "epoch": 2.0,
107
- "eval_loss": 0.14157189428806305,
108
- "eval_max_distance": 29,
109
- "eval_mean_distance": 2,
110
- "eval_runtime": 0.4066,
111
- "eval_samples_per_second": 122.96,
112
- "eval_steps_per_second": 4.918,
113
- "step": 5196
114
  },
115
  {
116
  "epoch": 2.1,
117
- "learning_rate": 0.0007005388760585066,
118
- "loss": 0.0924,
119
- "step": 5460
120
- },
121
- {
122
- "epoch": 2.25,
123
- "learning_rate": 0.0007505773672055427,
124
- "loss": 0.0927,
125
- "step": 5850
126
  },
127
  {
128
  "epoch": 2.4,
129
- "learning_rate": 0.0008006158583525789,
130
- "loss": 0.0872,
131
- "step": 6240
132
- },
133
- {
134
- "epoch": 2.55,
135
- "learning_rate": 0.0008506543494996151,
136
- "loss": 0.0841,
137
- "step": 6630
138
  },
139
  {
140
  "epoch": 2.7,
141
- "learning_rate": 0.0009006928406466513,
142
- "loss": 0.0808,
143
- "step": 7020
144
- },
145
- {
146
- "epoch": 2.85,
147
- "learning_rate": 0.0009507313317936874,
148
- "loss": 0.0801,
149
- "step": 7410
150
  },
151
  {
152
  "epoch": 3.0,
153
- "eval_loss": 0.10980188101530075,
154
- "eval_max_distance": 22,
155
- "eval_mean_distance": 2,
156
- "eval_runtime": 0.4052,
157
- "eval_samples_per_second": 123.401,
158
- "eval_steps_per_second": 4.936,
159
- "step": 7794
160
  },
161
  {
162
  "epoch": 3.0,
163
- "learning_rate": 0.0009999144641176974,
164
- "loss": 0.0778,
165
- "step": 7800
166
- },
167
- {
168
- "epoch": 3.15,
169
- "learning_rate": 0.0009943546317680268,
170
- "loss": 0.0684,
171
- "step": 8190
172
  },
173
  {
174
  "epoch": 3.3,
175
- "learning_rate": 0.000988794799418356,
176
- "loss": 0.0692,
177
- "step": 8580
178
- },
179
- {
180
- "epoch": 3.45,
181
- "learning_rate": 0.0009832349670686853,
182
- "loss": 0.0629,
183
- "step": 8970
184
  },
185
  {
186
  "epoch": 3.6,
187
- "learning_rate": 0.0009776751347190145,
188
  "loss": 0.0653,
189
- "step": 9360
190
- },
191
- {
192
- "epoch": 3.75,
193
- "learning_rate": 0.0009721153023693439,
194
- "loss": 0.0595,
195
- "step": 9750
196
  },
197
  {
198
  "epoch": 3.9,
199
- "learning_rate": 0.0009665554700196733,
200
- "loss": 0.0575,
201
- "step": 10140
202
  },
203
  {
204
  "epoch": 4.0,
205
- "eval_loss": 0.10807737708091736,
206
- "eval_max_distance": 18,
207
- "eval_mean_distance": 2,
208
- "eval_runtime": 0.3927,
209
- "eval_samples_per_second": 127.308,
210
- "eval_steps_per_second": 5.092,
211
- "step": 10392
212
- },
213
- {
214
- "epoch": 4.05,
215
- "learning_rate": 0.0009609956376700025,
216
- "loss": 0.1012,
217
- "step": 10530
218
  },
219
  {
220
  "epoch": 4.2,
221
- "learning_rate": 0.0009554358053203319,
222
- "loss": 0.0526,
223
- "step": 10920
224
- },
225
- {
226
- "epoch": 4.35,
227
- "learning_rate": 0.0009498759729706612,
228
- "loss": 0.0496,
229
- "step": 11310
230
  },
231
  {
232
  "epoch": 4.5,
233
- "learning_rate": 0.0009443161406209905,
234
- "loss": 0.0492,
235
- "step": 11700
236
- },
237
- {
238
- "epoch": 4.65,
239
- "learning_rate": 0.0009387563082713198,
240
- "loss": 0.0483,
241
- "step": 12090
242
  },
243
  {
244
  "epoch": 4.8,
245
- "learning_rate": 0.0009331964759216492,
246
- "loss": 0.0469,
247
- "step": 12480
248
- },
249
- {
250
- "epoch": 4.95,
251
- "learning_rate": 0.0009276366435719784,
252
- "loss": 0.0452,
253
- "step": 12870
254
  },
255
  {
256
  "epoch": 5.0,
257
- "eval_loss": 0.08966636657714844,
258
- "eval_max_distance": 14,
259
  "eval_mean_distance": 1,
260
- "eval_runtime": 0.3879,
261
- "eval_samples_per_second": 128.906,
262
- "eval_steps_per_second": 5.156,
263
- "step": 12990
264
  },
265
  {
266
  "epoch": 5.1,
267
- "learning_rate": 0.0009220768112223078,
268
- "loss": 0.0392,
269
- "step": 13260
270
- },
271
- {
272
- "epoch": 5.25,
273
- "learning_rate": 0.0009165169788726371,
274
- "loss": 0.0397,
275
- "step": 13650
276
  },
277
  {
278
  "epoch": 5.4,
279
- "learning_rate": 0.0009109571465229664,
280
- "loss": 0.0393,
281
- "step": 14040
282
- },
283
- {
284
- "epoch": 5.55,
285
- "learning_rate": 0.0009053973141732957,
286
- "loss": 0.0399,
287
- "step": 14430
288
  },
289
  {
290
  "epoch": 5.7,
291
- "learning_rate": 0.0008998374818236251,
292
- "loss": 0.039,
293
- "step": 14820
294
- },
295
- {
296
- "epoch": 5.85,
297
- "learning_rate": 0.0008942776494739543,
298
- "loss": 0.0372,
299
- "step": 15210
300
  },
301
  {
302
  "epoch": 6.0,
303
- "eval_loss": 0.07197271287441254,
304
- "eval_max_distance": 15,
305
- "eval_mean_distance": 1,
306
- "eval_runtime": 0.3764,
307
- "eval_samples_per_second": 132.832,
308
- "eval_steps_per_second": 5.313,
309
- "step": 15588
310
  },
311
  {
312
  "epoch": 6.0,
313
- "learning_rate": 0.0008887178171242837,
314
- "loss": 0.039,
315
- "step": 15600
316
- },
317
- {
318
- "epoch": 6.15,
319
- "learning_rate": 0.000883157984774613,
320
- "loss": 0.0312,
321
- "step": 15990
322
  },
323
  {
324
  "epoch": 6.3,
325
- "learning_rate": 0.0008775981524249422,
326
- "loss": 0.0337,
327
- "step": 16380
328
- },
329
- {
330
- "epoch": 6.45,
331
- "learning_rate": 0.0008720383200752716,
332
- "loss": 0.0328,
333
- "step": 16770
334
- },
335
- {
336
- "epoch": 6.61,
337
- "learning_rate": 0.0008664784877256009,
338
- "loss": 0.0327,
339
- "step": 17160
340
  },
341
  {
342
- "epoch": 6.76,
343
- "learning_rate": 0.0008609186553759302,
344
- "loss": 0.0305,
345
- "step": 17550
346
  },
347
  {
348
- "epoch": 6.91,
349
- "learning_rate": 0.0008553588230262595,
350
- "loss": 0.0323,
351
- "step": 17940
352
  },
353
  {
354
  "epoch": 7.0,
355
- "eval_loss": 0.08398188650608063,
356
- "eval_max_distance": 12,
357
- "eval_mean_distance": 1,
358
- "eval_runtime": 0.3765,
359
- "eval_samples_per_second": 132.8,
360
- "eval_steps_per_second": 5.312,
361
- "step": 18186
362
- },
363
- {
364
- "epoch": 7.06,
365
- "learning_rate": 0.0008497989906765889,
366
- "loss": 0.0286,
367
- "step": 18330
368
- },
369
- {
370
- "epoch": 7.21,
371
- "learning_rate": 0.0008442391583269181,
372
- "loss": 0.0263,
373
- "step": 18720
374
- },
375
- {
376
- "epoch": 7.36,
377
- "learning_rate": 0.0008386793259772475,
378
- "loss": 0.0269,
379
- "step": 19110
380
- },
381
- {
382
- "epoch": 7.51,
383
- "learning_rate": 0.0008331194936275768,
384
- "loss": 0.0268,
385
- "step": 19500
386
  },
387
  {
388
- "epoch": 7.66,
389
- "learning_rate": 0.0008275596612779061,
390
- "loss": 0.0283,
391
- "step": 19890
392
  },
393
  {
394
- "epoch": 7.81,
395
- "learning_rate": 0.0008219998289282354,
396
- "loss": 0.0274,
397
- "step": 20280
398
  },
399
  {
400
- "epoch": 7.96,
401
- "learning_rate": 0.0008164399965785648,
402
- "loss": 0.0267,
403
- "step": 20670
404
  },
405
  {
406
  "epoch": 8.0,
407
- "eval_loss": 0.07682657241821289,
408
- "eval_max_distance": 16,
409
  "eval_mean_distance": 1,
410
- "eval_runtime": 0.3687,
411
- "eval_samples_per_second": 135.62,
412
- "eval_steps_per_second": 5.425,
413
- "step": 20784
414
- },
415
- {
416
- "epoch": 8.11,
417
- "learning_rate": 0.000810880164228894,
418
- "loss": 0.0235,
419
- "step": 21060
420
  },
421
  {
422
- "epoch": 8.26,
423
- "learning_rate": 0.0008053203318792234,
424
- "loss": 0.0221,
425
- "step": 21450
426
  },
427
  {
428
  "epoch": 8.41,
429
- "learning_rate": 0.0007997604995295527,
430
- "loss": 0.0221,
431
- "step": 21840
432
- },
433
- {
434
- "epoch": 8.56,
435
- "learning_rate": 0.0007942006671798819,
436
- "loss": 0.0223,
437
- "step": 22230
438
  },
439
  {
440
  "epoch": 8.71,
441
- "learning_rate": 0.0007886408348302113,
442
- "loss": 0.0233,
443
- "step": 22620
444
- },
445
- {
446
- "epoch": 8.86,
447
- "learning_rate": 0.0007830810024805405,
448
- "loss": 0.0231,
449
- "step": 23010
450
  },
451
  {
452
  "epoch": 9.0,
453
- "eval_loss": 0.06973634660243988,
454
- "eval_max_distance": 10,
455
  "eval_mean_distance": 1,
456
- "eval_runtime": 0.3759,
457
- "eval_samples_per_second": 133.026,
458
- "eval_steps_per_second": 5.321,
459
- "step": 23382
460
  },
461
  {
462
  "epoch": 9.01,
463
- "learning_rate": 0.0007775211701308699,
464
- "loss": 0.0227,
465
- "step": 23400
466
- },
467
- {
468
- "epoch": 9.16,
469
- "learning_rate": 0.0007719613377811992,
470
- "loss": 0.0185,
471
- "step": 23790
472
  },
473
  {
474
  "epoch": 9.31,
475
- "learning_rate": 0.0007664015054315285,
476
- "loss": 0.0183,
477
- "step": 24180
478
- },
479
- {
480
- "epoch": 9.46,
481
- "learning_rate": 0.0007608416730818578,
482
- "loss": 0.0191,
483
- "step": 24570
484
  },
485
  {
486
  "epoch": 9.61,
487
- "learning_rate": 0.0007552818407321872,
488
- "loss": 0.019,
489
- "step": 24960
490
- },
491
- {
492
- "epoch": 9.76,
493
- "learning_rate": 0.0007497220083825164,
494
- "loss": 0.0193,
495
- "step": 25350
496
  },
497
  {
498
  "epoch": 9.91,
499
- "learning_rate": 0.0007441621760328458,
500
- "loss": 0.0199,
501
- "step": 25740
502
  },
503
  {
504
  "epoch": 10.0,
505
- "eval_loss": 0.07169829308986664,
506
- "eval_max_distance": 9,
507
  "eval_mean_distance": 1,
508
- "eval_runtime": 0.3704,
509
- "eval_samples_per_second": 134.993,
510
- "eval_steps_per_second": 5.4,
511
- "step": 25980
512
- },
513
- {
514
- "epoch": 10.06,
515
- "learning_rate": 0.0007386023436831751,
516
- "loss": 0.0184,
517
- "step": 26130
518
  },
519
  {
520
  "epoch": 10.21,
521
- "learning_rate": 0.0007330425113335044,
522
- "loss": 0.016,
523
- "step": 26520
524
- },
525
- {
526
- "epoch": 10.36,
527
- "learning_rate": 0.0007274826789838337,
528
- "loss": 0.0164,
529
- "step": 26910
530
  },
531
  {
532
  "epoch": 10.51,
533
- "learning_rate": 0.0007219228466341631,
534
- "loss": 0.016,
535
- "step": 27300
536
- },
537
- {
538
- "epoch": 10.66,
539
- "learning_rate": 0.0007163630142844923,
540
- "loss": 0.0169,
541
- "step": 27690
542
  },
543
  {
544
  "epoch": 10.81,
545
- "learning_rate": 0.0007108031819348217,
546
- "loss": 0.0165,
547
- "step": 28080
548
- },
549
- {
550
- "epoch": 10.96,
551
- "learning_rate": 0.000705243349585151,
552
- "loss": 0.0168,
553
- "step": 28470
554
  },
555
  {
556
  "epoch": 11.0,
557
- "eval_loss": 0.08123478293418884,
558
- "eval_max_distance": 16,
559
  "eval_mean_distance": 1,
560
- "eval_runtime": 0.3865,
561
- "eval_samples_per_second": 129.356,
562
- "eval_steps_per_second": 5.174,
563
- "step": 28578
564
  },
565
  {
566
  "epoch": 11.11,
567
- "learning_rate": 0.0006996835172354803,
568
- "loss": 0.015,
569
- "step": 28860
570
- },
571
- {
572
- "epoch": 11.26,
573
- "learning_rate": 0.0006941236848858096,
574
- "loss": 0.0137,
575
- "step": 29250
576
  },
577
  {
578
  "epoch": 11.41,
579
- "learning_rate": 0.0006885638525361389,
580
- "loss": 0.0151,
581
- "step": 29640
582
- },
583
- {
584
- "epoch": 11.56,
585
- "learning_rate": 0.0006830040201864682,
586
- "loss": 0.0144,
587
- "step": 30030
588
  },
589
  {
590
  "epoch": 11.71,
591
- "learning_rate": 0.0006774441878367975,
592
- "loss": 0.0147,
593
- "step": 30420
594
- },
595
- {
596
- "epoch": 11.86,
597
- "learning_rate": 0.0006718843554871269,
598
- "loss": 0.0148,
599
- "step": 30810
600
  },
601
  {
602
  "epoch": 12.0,
603
- "eval_loss": 0.09610763192176819,
604
- "eval_max_distance": 12,
605
  "eval_mean_distance": 1,
606
- "eval_runtime": 0.3633,
607
- "eval_samples_per_second": 137.639,
608
- "eval_steps_per_second": 5.506,
609
- "step": 31176
610
  },
611
  {
612
  "epoch": 12.01,
613
- "learning_rate": 0.0006663245231374561,
614
- "loss": 0.0145,
615
- "step": 31200
616
- },
617
- {
618
- "epoch": 12.16,
619
- "learning_rate": 0.0006607646907877855,
620
- "loss": 0.0124,
621
- "step": 31590
622
  },
623
  {
624
  "epoch": 12.31,
625
- "learning_rate": 0.0006552048584381148,
626
- "loss": 0.0117,
627
- "step": 31980
628
- },
629
- {
630
- "epoch": 12.46,
631
- "learning_rate": 0.0006496450260884441,
632
- "loss": 0.0121,
633
- "step": 32370
634
  },
635
  {
636
  "epoch": 12.61,
637
- "learning_rate": 0.0006440851937387734,
638
- "loss": 0.0124,
639
- "step": 32760
640
- },
641
- {
642
- "epoch": 12.76,
643
- "learning_rate": 0.0006385253613891028,
644
- "loss": 0.0125,
645
- "step": 33150
646
  },
647
  {
648
  "epoch": 12.91,
649
- "learning_rate": 0.000632965529039432,
650
- "loss": 0.0128,
651
- "step": 33540
652
  },
653
  {
654
  "epoch": 13.0,
655
- "eval_loss": 0.08225859701633453,
656
- "eval_max_distance": 9,
657
  "eval_mean_distance": 1,
658
- "eval_runtime": 0.3712,
659
- "eval_samples_per_second": 134.695,
660
- "eval_steps_per_second": 5.388,
661
- "step": 33774
662
- },
663
- {
664
- "epoch": 13.06,
665
- "learning_rate": 0.0006274056966897614,
666
- "loss": 0.0116,
667
- "step": 33930
668
  },
669
  {
670
  "epoch": 13.21,
671
- "learning_rate": 0.0006218458643400907,
672
- "loss": 0.0106,
673
- "step": 34320
674
- },
675
- {
676
- "epoch": 13.36,
677
- "learning_rate": 0.00061628603199042,
678
- "loss": 0.0104,
679
- "step": 34710
680
  },
681
  {
682
  "epoch": 13.51,
683
- "learning_rate": 0.0006107261996407493,
684
- "loss": 0.011,
685
- "step": 35100
686
- },
687
- {
688
- "epoch": 13.66,
689
- "learning_rate": 0.0006051663672910787,
690
- "loss": 0.0108,
691
- "step": 35490
692
  },
693
  {
694
  "epoch": 13.81,
695
- "learning_rate": 0.0005996065349414079,
696
- "loss": 0.0111,
697
- "step": 35880
698
- },
699
- {
700
- "epoch": 13.96,
701
- "learning_rate": 0.0005940467025917372,
702
- "loss": 0.0112,
703
- "step": 36270
704
  },
705
  {
706
  "epoch": 14.0,
707
- "eval_loss": 0.07655028253793716,
708
- "eval_max_distance": 12,
709
  "eval_mean_distance": 1,
710
- "eval_runtime": 0.361,
711
- "eval_samples_per_second": 138.506,
712
- "eval_steps_per_second": 5.54,
713
- "step": 36372
714
  },
715
  {
716
  "epoch": 14.11,
717
- "learning_rate": 0.0005884868702420666,
718
- "loss": 0.0098,
719
- "step": 36660
720
- },
721
- {
722
- "epoch": 14.26,
723
- "learning_rate": 0.0005829270378923958,
724
- "loss": 0.009,
725
- "step": 37050
726
  },
727
  {
728
  "epoch": 14.41,
729
- "learning_rate": 0.0005773672055427252,
730
- "loss": 0.0093,
731
- "step": 37440
732
- },
733
- {
734
- "epoch": 14.56,
735
- "learning_rate": 0.0005718073731930545,
736
- "loss": 0.0095,
737
- "step": 37830
738
  },
739
  {
740
  "epoch": 14.71,
741
- "learning_rate": 0.0005662475408433838,
742
- "loss": 0.0093,
743
- "step": 38220
744
- },
745
- {
746
- "epoch": 14.86,
747
- "learning_rate": 0.0005606877084937131,
748
- "loss": 0.0093,
749
- "step": 38610
750
  },
751
  {
752
  "epoch": 15.0,
753
- "eval_loss": 0.07127052545547485,
754
- "eval_max_distance": 9,
755
  "eval_mean_distance": 1,
756
- "eval_runtime": 0.3671,
757
- "eval_samples_per_second": 136.219,
758
- "eval_steps_per_second": 5.449,
759
- "step": 38970
760
  },
761
  {
762
  "epoch": 15.01,
763
- "learning_rate": 0.0005551278761440425,
764
- "loss": 0.0101,
765
- "step": 39000
766
- },
767
- {
768
- "epoch": 15.16,
769
- "learning_rate": 0.0005495680437943717,
770
- "loss": 0.0078,
771
- "step": 39390
772
  },
773
  {
774
  "epoch": 15.31,
775
- "learning_rate": 0.0005440082114447011,
776
- "loss": 0.0079,
777
- "step": 39780
778
- },
779
- {
780
- "epoch": 15.46,
781
- "learning_rate": 0.0005384483790950304,
782
- "loss": 0.0081,
783
- "step": 40170
784
  },
785
  {
786
  "epoch": 15.61,
787
- "learning_rate": 0.0005328885467453597,
788
- "loss": 0.0085,
789
- "step": 40560
790
- },
791
- {
792
- "epoch": 15.76,
793
- "learning_rate": 0.000527328714395689,
794
- "loss": 0.0088,
795
- "step": 40950
796
  },
797
  {
798
  "epoch": 15.91,
799
- "learning_rate": 0.0005217688820460184,
800
- "loss": 0.0083,
801
- "step": 41340
802
  },
803
  {
804
  "epoch": 16.0,
805
- "eval_loss": 0.08469703793525696,
806
- "eval_max_distance": 14,
807
  "eval_mean_distance": 1,
808
- "eval_runtime": 0.3815,
809
- "eval_samples_per_second": 131.073,
810
- "eval_steps_per_second": 5.243,
811
- "step": 41568
812
- },
813
- {
814
- "epoch": 16.06,
815
- "learning_rate": 0.0005162090496963476,
816
- "loss": 0.0081,
817
- "step": 41730
818
  },
819
  {
820
  "epoch": 16.21,
821
- "learning_rate": 0.000510649217346677,
822
- "loss": 0.0069,
823
- "step": 42120
824
- },
825
- {
826
- "epoch": 16.36,
827
- "learning_rate": 0.0005050893849970063,
828
- "loss": 0.007,
829
- "step": 42510
830
  },
831
  {
832
  "epoch": 16.51,
833
- "learning_rate": 0.0004995295526473355,
834
- "loss": 0.0071,
835
- "step": 42900
836
- },
837
- {
838
- "epoch": 16.66,
839
- "learning_rate": 0.0004939697202976649,
840
- "loss": 0.0073,
841
- "step": 43290
842
  },
843
  {
844
  "epoch": 16.81,
845
- "learning_rate": 0.0004884098879479942,
846
- "loss": 0.0076,
847
- "step": 43680
848
- },
849
- {
850
- "epoch": 16.96,
851
- "learning_rate": 0.0004828500555983235,
852
- "loss": 0.0076,
853
- "step": 44070
854
  },
855
  {
856
  "epoch": 17.0,
857
- "eval_loss": 0.08625645935535431,
858
- "eval_max_distance": 11,
859
  "eval_mean_distance": 1,
860
- "eval_runtime": 0.3551,
861
- "eval_samples_per_second": 140.8,
862
- "eval_steps_per_second": 5.632,
863
- "step": 44166
864
  },
865
  {
866
  "epoch": 17.11,
867
- "learning_rate": 0.00047729022324865286,
868
- "loss": 0.0064,
869
- "step": 44460
870
- },
871
- {
872
- "epoch": 17.26,
873
- "learning_rate": 0.00047173039089898214,
874
- "loss": 0.0059,
875
- "step": 44850
876
  },
877
  {
878
  "epoch": 17.41,
879
- "learning_rate": 0.0004661705585493115,
880
- "loss": 0.0064,
881
- "step": 45240
882
- },
883
- {
884
- "epoch": 17.56,
885
- "learning_rate": 0.0004606107261996408,
886
- "loss": 0.0068,
887
- "step": 45630
888
  },
889
  {
890
  "epoch": 17.71,
891
- "learning_rate": 0.00045505089384997004,
892
- "loss": 0.0066,
893
- "step": 46020
894
- },
895
- {
896
- "epoch": 17.86,
897
- "learning_rate": 0.00044949106150029937,
898
- "loss": 0.0064,
899
- "step": 46410
900
  },
901
  {
902
  "epoch": 18.0,
903
- "eval_loss": 0.08296500891447067,
904
- "eval_max_distance": 14,
905
  "eval_mean_distance": 1,
906
- "eval_runtime": 0.3721,
907
- "eval_samples_per_second": 134.372,
908
- "eval_steps_per_second": 5.375,
909
- "step": 46764
910
  },
911
  {
912
  "epoch": 18.01,
913
- "learning_rate": 0.0004439312291506287,
914
- "loss": 0.0065,
915
- "step": 46800
916
- },
917
- {
918
- "epoch": 18.16,
919
- "learning_rate": 0.000438371396800958,
920
- "loss": 0.0055,
921
- "step": 47190
922
  },
923
  {
924
  "epoch": 18.31,
925
- "learning_rate": 0.0004328115644512873,
926
- "loss": 0.0052,
927
- "step": 47580
928
- },
929
- {
930
- "epoch": 18.46,
931
- "learning_rate": 0.00042725173210161665,
932
- "loss": 0.0057,
933
- "step": 47970
934
  },
935
  {
936
  "epoch": 18.61,
937
- "learning_rate": 0.00042169189975194593,
938
- "loss": 0.006,
939
- "step": 48360
940
- },
941
- {
942
- "epoch": 18.76,
943
- "learning_rate": 0.00041613206740227527,
944
- "loss": 0.0055,
945
- "step": 48750
946
  },
947
  {
948
  "epoch": 18.91,
949
- "learning_rate": 0.0004105722350526046,
950
- "loss": 0.0054,
951
- "step": 49140
952
  },
953
  {
954
  "epoch": 19.0,
955
- "eval_loss": 0.08839410543441772,
956
- "eval_max_distance": 11,
957
  "eval_mean_distance": 1,
958
- "eval_runtime": 0.367,
959
- "eval_samples_per_second": 136.245,
960
- "eval_steps_per_second": 5.45,
961
- "step": 49362
962
- },
963
- {
964
- "epoch": 19.06,
965
- "learning_rate": 0.0004050124027029339,
966
- "loss": 0.0057,
967
- "step": 49530
968
  },
969
  {
970
  "epoch": 19.21,
971
- "learning_rate": 0.0003994525703532632,
972
- "loss": 0.0047,
973
- "step": 49920
974
  },
975
  {
976
- "epoch": 19.36,
977
- "learning_rate": 0.0003938927380035925,
978
- "loss": 0.0048,
979
- "step": 50310
980
- },
981
- {
982
- "epoch": 19.52,
983
- "learning_rate": 0.00038833290565392183,
984
- "loss": 0.0052,
985
- "step": 50700
986
  },
987
  {
988
- "epoch": 19.67,
989
- "learning_rate": 0.00038277307330425117,
990
- "loss": 0.005,
991
- "step": 51090
992
  },
993
  {
994
- "epoch": 19.82,
995
- "learning_rate": 0.00037721324095458045,
996
- "loss": 0.0048,
997
- "step": 51480
 
 
 
 
998
  },
999
  {
1000
- "epoch": 19.97,
1001
- "learning_rate": 0.0003716534086049098,
1002
- "loss": 0.0052,
1003
- "step": 51870
1004
  },
1005
  {
1006
- "epoch": 20.0,
1007
- "eval_loss": 0.08214738219976425,
1008
- "eval_max_distance": 10,
1009
- "eval_mean_distance": 1,
1010
- "eval_runtime": 0.3692,
1011
- "eval_samples_per_second": 135.434,
1012
- "eval_steps_per_second": 5.417,
1013
- "step": 51960
1014
  },
1015
  {
1016
- "epoch": 20.12,
1017
- "learning_rate": 0.0003660935762552391,
1018
- "loss": 0.0049,
1019
- "step": 52260
1020
  },
1021
  {
1022
- "epoch": 20.27,
1023
- "learning_rate": 0.00036053374390556834,
1024
- "loss": 0.0043,
1025
- "step": 52650
 
 
 
 
1026
  },
1027
  {
1028
- "epoch": 20.42,
1029
- "learning_rate": 0.0003549739115558977,
1030
- "loss": 0.0043,
1031
- "step": 53040
1032
  },
1033
  {
1034
- "epoch": 20.57,
1035
- "learning_rate": 0.000349414079206227,
1036
- "loss": 0.0044,
1037
- "step": 53430
1038
  },
1039
  {
1040
- "epoch": 20.72,
1041
- "learning_rate": 0.0003438542468565563,
1042
- "loss": 0.0044,
1043
- "step": 53820
1044
  },
1045
  {
1046
- "epoch": 20.87,
1047
- "learning_rate": 0.0003382944145068856,
1048
- "loss": 0.0045,
1049
- "step": 54210
1050
  },
1051
  {
1052
- "epoch": 21.0,
1053
- "eval_loss": 0.0914614275097847,
1054
- "eval_max_distance": 14,
1055
  "eval_mean_distance": 1,
1056
- "eval_runtime": 0.3653,
1057
- "eval_samples_per_second": 136.874,
1058
- "eval_steps_per_second": 5.475,
1059
- "step": 54558
1060
  },
1061
  {
1062
- "epoch": 21.02,
1063
- "learning_rate": 0.00033273458215721496,
1064
- "loss": 0.0041,
1065
- "step": 54600
1066
  },
1067
  {
1068
- "epoch": 21.17,
1069
- "learning_rate": 0.00032717474980754424,
1070
- "loss": 0.0035,
1071
- "step": 54990
1072
  },
1073
  {
1074
- "epoch": 21.32,
1075
- "learning_rate": 0.0003216149174578736,
1076
- "loss": 0.0038,
1077
- "step": 55380
1078
  },
1079
  {
1080
- "epoch": 21.47,
1081
- "learning_rate": 0.0003160550851082029,
1082
- "loss": 0.0038,
1083
- "step": 55770
 
 
 
 
1084
  },
1085
  {
1086
- "epoch": 21.62,
1087
- "learning_rate": 0.0003104952527585322,
1088
- "loss": 0.0041,
1089
- "step": 56160
1090
  },
1091
  {
1092
- "epoch": 21.77,
1093
- "learning_rate": 0.0003049354204088615,
1094
- "loss": 0.004,
1095
- "step": 56550
1096
  },
1097
  {
1098
- "epoch": 21.92,
1099
- "learning_rate": 0.00029937558805919086,
1100
- "loss": 0.0037,
1101
- "step": 56940
1102
  },
1103
  {
1104
- "epoch": 22.0,
1105
- "eval_loss": 0.09314610809087753,
1106
- "eval_max_distance": 14,
1107
  "eval_mean_distance": 1,
1108
- "eval_runtime": 0.3634,
1109
- "eval_samples_per_second": 137.604,
1110
- "eval_steps_per_second": 5.504,
1111
- "step": 57156
1112
  },
1113
  {
1114
- "epoch": 22.07,
1115
- "learning_rate": 0.00029381575570952014,
1116
- "loss": 0.0037,
1117
- "step": 57330
1118
  },
1119
  {
1120
- "epoch": 22.22,
1121
- "learning_rate": 0.0002882559233598495,
1122
- "loss": 0.0033,
1123
- "step": 57720
1124
  },
1125
  {
1126
- "epoch": 22.37,
1127
- "learning_rate": 0.0002826960910101788,
1128
- "loss": 0.0034,
1129
- "step": 58110
1130
  },
1131
  {
1132
- "epoch": 22.52,
1133
- "learning_rate": 0.0002771362586605081,
1134
- "loss": 0.0034,
1135
- "step": 58500
1136
  },
1137
  {
1138
- "epoch": 22.67,
1139
- "learning_rate": 0.0002715764263108374,
1140
- "loss": 0.0035,
1141
- "step": 58890
 
 
 
 
1142
  },
1143
  {
1144
- "epoch": 22.82,
1145
- "learning_rate": 0.0002660165939611667,
1146
- "loss": 0.0034,
1147
- "step": 59280
1148
  },
1149
  {
1150
- "epoch": 22.97,
1151
- "learning_rate": 0.000260456761611496,
1152
- "loss": 0.0036,
1153
- "step": 59670
1154
  },
1155
  {
1156
- "epoch": 23.0,
1157
- "eval_loss": 0.09405915439128876,
1158
- "eval_max_distance": 9,
 
 
 
 
 
 
1159
  "eval_mean_distance": 1,
1160
- "eval_runtime": 0.3715,
1161
- "eval_samples_per_second": 134.573,
1162
- "eval_steps_per_second": 5.383,
1163
- "step": 59754
1164
  },
1165
  {
1166
- "epoch": 23.12,
1167
- "learning_rate": 0.0002548969292618253,
1168
- "loss": 0.003,
1169
- "step": 60060
1170
  },
1171
  {
1172
- "epoch": 23.27,
1173
- "learning_rate": 0.00024933709691215465,
1174
- "loss": 0.0031,
1175
- "step": 60450
1176
  },
1177
  {
1178
- "epoch": 23.42,
1179
- "learning_rate": 0.00024377726456248396,
1180
- "loss": 0.003,
1181
- "step": 60840
1182
  },
1183
  {
1184
- "epoch": 23.57,
1185
- "learning_rate": 0.00023821743221281327,
1186
- "loss": 0.0029,
1187
- "step": 61230
 
 
 
 
1188
  },
1189
  {
1190
- "epoch": 23.72,
1191
- "learning_rate": 0.00023265759986314258,
1192
- "loss": 0.0028,
1193
- "step": 61620
1194
  },
1195
  {
1196
- "epoch": 23.87,
1197
- "learning_rate": 0.0002270977675134719,
1198
- "loss": 0.0028,
1199
- "step": 62010
1200
  },
1201
  {
1202
- "epoch": 24.0,
1203
- "eval_loss": 0.08611776679754257,
1204
- "eval_max_distance": 13,
 
 
 
 
 
 
 
 
 
 
 
 
1205
  "eval_mean_distance": 1,
1206
- "eval_runtime": 0.3594,
1207
- "eval_samples_per_second": 139.139,
1208
- "eval_steps_per_second": 5.566,
1209
- "step": 62352
1210
  },
1211
  {
1212
- "epoch": 24.02,
1213
- "learning_rate": 0.00022153793516380122,
1214
- "loss": 0.0027,
1215
- "step": 62400
1216
  },
1217
  {
1218
- "epoch": 24.17,
1219
- "learning_rate": 0.00021597810281413053,
1220
- "loss": 0.0026,
1221
- "step": 62790
1222
  },
1223
  {
1224
- "epoch": 24.32,
1225
- "learning_rate": 0.00021041827046445986,
1226
- "loss": 0.0027,
1227
- "step": 63180
1228
  },
1229
  {
1230
- "epoch": 24.47,
1231
- "learning_rate": 0.00020485843811478917,
1232
- "loss": 0.0027,
1233
- "step": 63570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1234
  },
1235
  {
1236
- "epoch": 24.62,
1237
- "learning_rate": 0.00019929860576511847,
1238
  "loss": 0.0027,
1239
- "step": 63960
1240
  },
1241
  {
1242
- "epoch": 24.77,
1243
- "learning_rate": 0.00019373877341544778,
1244
- "loss": 0.0024,
1245
- "step": 64350
1246
  },
1247
  {
1248
- "epoch": 24.92,
1249
- "learning_rate": 0.0001881789410657771,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1250
  "loss": 0.0026,
1251
- "step": 64740
1252
  },
1253
  {
1254
- "epoch": 25.0,
1255
- "eval_loss": 0.09115344285964966,
1256
- "eval_max_distance": 12,
 
 
 
 
 
 
 
 
 
 
 
 
1257
  "eval_mean_distance": 1,
1258
- "eval_runtime": 0.3622,
1259
- "eval_samples_per_second": 138.045,
1260
- "eval_steps_per_second": 5.522,
1261
- "step": 64950
1262
  },
1263
  {
1264
- "epoch": 25.07,
1265
- "learning_rate": 0.0001826191087161064,
1266
- "loss": 0.0026,
1267
- "step": 65130
1268
  },
1269
  {
1270
- "epoch": 25.22,
1271
- "learning_rate": 0.00017705927636643573,
1272
  "loss": 0.0023,
1273
- "step": 65520
1274
  },
1275
  {
1276
- "epoch": 25.37,
1277
- "learning_rate": 0.00017149944401676504,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1278
  "loss": 0.0023,
1279
- "step": 65910
1280
  },
1281
  {
1282
- "epoch": 25.52,
1283
- "learning_rate": 0.00016593961166709435,
1284
  "loss": 0.0021,
1285
- "step": 66300
1286
  },
1287
  {
1288
- "epoch": 25.67,
1289
- "learning_rate": 0.00016037977931742368,
1290
- "loss": 0.0021,
1291
- "step": 66690
1292
  },
1293
  {
1294
- "epoch": 25.82,
1295
- "learning_rate": 0.000154819946967753,
1296
- "loss": 0.0024,
1297
- "step": 67080
 
 
 
 
1298
  },
1299
  {
1300
- "epoch": 25.97,
1301
- "learning_rate": 0.0001492601146180823,
1302
- "loss": 0.0024,
1303
- "step": 67470
1304
  },
1305
  {
1306
- "epoch": 26.0,
1307
- "eval_loss": 0.09158334881067276,
1308
- "eval_max_distance": 9,
1309
- "eval_mean_distance": 0,
1310
- "eval_runtime": 0.3618,
1311
- "eval_samples_per_second": 138.208,
1312
- "eval_steps_per_second": 5.528,
1313
- "step": 67548
1314
  },
1315
  {
1316
- "epoch": 26.12,
1317
- "learning_rate": 0.0001437002822684116,
1318
  "loss": 0.0021,
1319
- "step": 67860
1320
  },
1321
  {
1322
- "epoch": 26.27,
1323
- "learning_rate": 0.0001381404499187409,
1324
- "loss": 0.0019,
1325
- "step": 68250
1326
  },
1327
  {
1328
- "epoch": 26.42,
1329
- "learning_rate": 0.00013258061756907022,
 
 
 
 
 
 
 
 
 
 
1330
  "loss": 0.002,
1331
- "step": 68640
1332
  },
1333
  {
1334
- "epoch": 26.57,
1335
- "learning_rate": 0.00012702078521939955,
1336
  "loss": 0.0019,
1337
- "step": 69030
1338
  },
1339
  {
1340
- "epoch": 26.72,
1341
- "learning_rate": 0.00012146095286972886,
1342
- "loss": 0.0021,
1343
- "step": 69420
1344
  },
1345
  {
1346
- "epoch": 26.87,
1347
- "learning_rate": 0.00011590112052005817,
1348
- "loss": 0.002,
1349
- "step": 69810
 
 
 
 
1350
  },
1351
  {
1352
- "epoch": 27.0,
1353
- "eval_loss": 0.08878373354673386,
1354
- "eval_max_distance": 9,
1355
- "eval_mean_distance": 0,
1356
- "eval_runtime": 0.3454,
1357
- "eval_samples_per_second": 144.754,
1358
- "eval_steps_per_second": 5.79,
1359
- "step": 70146
1360
  },
1361
  {
1362
- "epoch": 27.02,
1363
- "learning_rate": 0.00011034128817038747,
1364
- "loss": 0.0021,
1365
- "step": 70200
1366
  },
1367
  {
1368
- "epoch": 27.17,
1369
- "learning_rate": 0.0001047814558207168,
1370
  "loss": 0.0017,
1371
- "step": 70590
1372
  },
1373
  {
1374
- "epoch": 27.32,
1375
- "learning_rate": 9.92216234710461e-05,
1376
- "loss": 0.0018,
1377
- "step": 70980
 
 
 
 
1378
  },
1379
  {
1380
- "epoch": 27.47,
1381
- "learning_rate": 9.366179112137542e-05,
1382
- "loss": 0.0017,
1383
- "step": 71370
1384
  },
1385
  {
1386
- "epoch": 27.62,
1387
- "learning_rate": 8.810195877170473e-05,
1388
  "loss": 0.0016,
1389
- "step": 71760
1390
  },
1391
  {
1392
- "epoch": 27.77,
1393
- "learning_rate": 8.254212642203404e-05,
1394
- "loss": 0.002,
1395
- "step": 72150
1396
  },
1397
  {
1398
- "epoch": 27.92,
1399
- "learning_rate": 7.698229407236336e-05,
1400
  "loss": 0.0017,
1401
- "step": 72540
1402
  },
1403
  {
1404
- "epoch": 28.0,
1405
- "eval_loss": 0.08879587054252625,
1406
- "eval_max_distance": 9,
1407
  "eval_mean_distance": 0,
1408
- "eval_runtime": 0.3476,
1409
- "eval_samples_per_second": 143.846,
1410
- "eval_steps_per_second": 5.754,
1411
- "step": 72744
1412
  },
1413
  {
1414
- "epoch": 28.07,
1415
- "learning_rate": 7.142246172269268e-05,
1416
- "loss": 0.0016,
1417
- "step": 72930
1418
  },
1419
  {
1420
- "epoch": 28.22,
1421
- "learning_rate": 6.586262937302199e-05,
1422
  "loss": 0.0016,
1423
- "step": 73320
1424
  },
1425
  {
1426
- "epoch": 28.37,
1427
- "learning_rate": 6.03027970233513e-05,
1428
  "loss": 0.0016,
1429
- "step": 73710
1430
  },
1431
  {
1432
- "epoch": 28.52,
1433
- "learning_rate": 5.474296467368061e-05,
1434
- "loss": 0.0016,
1435
- "step": 74100
 
 
 
 
1436
  },
1437
  {
1438
- "epoch": 28.67,
1439
- "learning_rate": 4.9183132324009924e-05,
1440
- "loss": 0.0016,
1441
- "step": 74490
1442
  },
1443
  {
1444
- "epoch": 28.82,
1445
- "learning_rate": 4.362329997433924e-05,
1446
- "loss": 0.0017,
1447
- "step": 74880
1448
  },
1449
  {
1450
- "epoch": 28.97,
1451
- "learning_rate": 3.806346762466855e-05,
1452
- "loss": 0.0017,
1453
- "step": 75270
1454
  },
1455
  {
1456
- "epoch": 29.0,
1457
- "eval_loss": 0.09515639394521713,
1458
- "eval_max_distance": 9,
1459
- "eval_mean_distance": 0,
1460
- "eval_runtime": 0.343,
1461
- "eval_samples_per_second": 145.752,
1462
- "eval_steps_per_second": 5.83,
1463
- "step": 75342
1464
  },
1465
  {
1466
- "epoch": 29.12,
1467
- "learning_rate": 3.250363527499786e-05,
 
 
 
 
 
 
1468
  "loss": 0.0015,
1469
- "step": 75660
1470
  },
1471
  {
1472
- "epoch": 29.27,
1473
- "learning_rate": 2.6943802925327177e-05,
1474
- "loss": 0.0016,
1475
- "step": 76050
1476
  },
1477
  {
1478
- "epoch": 29.42,
1479
- "learning_rate": 2.1383970575656488e-05,
1480
  "loss": 0.0014,
1481
- "step": 76440
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1482
  },
1483
  {
1484
- "epoch": 29.57,
1485
- "learning_rate": 1.5824138225985802e-05,
1486
  "loss": 0.0013,
1487
- "step": 76830
1488
  },
1489
  {
1490
- "epoch": 29.72,
1491
- "learning_rate": 1.0264305876315115e-05,
1492
- "loss": 0.0014,
1493
- "step": 77220
1494
  },
1495
  {
1496
- "epoch": 29.87,
1497
- "learning_rate": 4.704473526644427e-06,
1498
- "loss": 0.0014,
1499
- "step": 77610
 
 
 
 
1500
  },
1501
  {
1502
- "epoch": 30.0,
1503
- "eval_loss": 0.09847646951675415,
1504
- "eval_max_distance": 9,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1505
  "eval_mean_distance": 0,
1506
- "eval_runtime": 0.3435,
1507
- "eval_samples_per_second": 145.564,
1508
- "eval_steps_per_second": 5.823,
1509
- "step": 77940
1510
  },
1511
  {
1512
- "epoch": 30.0,
1513
- "step": 77940,
1514
- "total_flos": 4.517674593940685e+16,
1515
- "train_loss": 0.053724035134690526,
1516
- "train_runtime": 6582.4117,
1517
- "train_samples_per_second": 355.137,
1518
- "train_steps_per_second": 11.841
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1519
  }
1520
  ],
1521
- "logging_steps": 390,
1522
- "max_steps": 77940,
1523
- "num_train_epochs": 30,
1524
- "save_steps": 780,
1525
- "total_flos": 4.517674593940685e+16,
1526
  "trial_name": null,
1527
  "trial_params": null
1528
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 60.0,
5
  "eval_steps": 500,
6
+ "global_step": 197880,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 5.0535678188801295e-08,
14
+ "loss": 12.7149,
15
  "step": 1
16
  },
 
 
 
 
 
 
17
  {
18
  "epoch": 0.3,
19
+ "learning_rate": 5.0030321406913285e-05,
20
+ "loss": 3.3584,
21
+ "step": 990
 
 
 
 
 
 
22
  },
23
  {
24
  "epoch": 0.6,
25
+ "learning_rate": 0.00010006064281382657,
26
+ "loss": 0.3384,
27
+ "step": 1980
 
 
 
 
 
 
28
  },
29
  {
30
  "epoch": 0.9,
31
+ "learning_rate": 0.00015009096422073984,
32
+ "loss": 0.2236,
33
+ "step": 2970
34
  },
35
  {
36
  "epoch": 1.0,
37
+ "eval_loss": 0.11203870922327042,
38
+ "eval_max_distance": 133,
39
+ "eval_mean_distance": 5,
40
+ "eval_runtime": 0.5965,
41
+ "eval_samples_per_second": 83.828,
42
+ "eval_steps_per_second": 3.353,
43
+ "step": 3298
 
 
 
 
 
 
44
  },
45
  {
46
  "epoch": 1.2,
47
+ "learning_rate": 0.00020012128562765314,
48
+ "loss": 0.1679,
49
+ "step": 3960
 
 
 
 
 
 
50
  },
51
  {
52
  "epoch": 1.5,
53
+ "learning_rate": 0.0002501516070345664,
54
+ "loss": 0.1395,
55
+ "step": 4950
 
 
 
 
 
 
56
  },
57
  {
58
  "epoch": 1.8,
59
+ "learning_rate": 0.0003001819284414797,
60
+ "loss": 0.1179,
61
+ "step": 5940
 
 
 
 
 
 
62
  },
63
  {
64
  "epoch": 2.0,
65
+ "eval_loss": 0.05475025996565819,
66
+ "eval_max_distance": 82,
67
+ "eval_mean_distance": 3,
68
+ "eval_runtime": 0.5422,
69
+ "eval_samples_per_second": 92.223,
70
+ "eval_steps_per_second": 3.689,
71
+ "step": 6596
72
  },
73
  {
74
  "epoch": 2.1,
75
+ "learning_rate": 0.0003502122498483929,
76
+ "loss": 0.1022,
77
+ "step": 6930
 
 
 
 
 
 
78
  },
79
  {
80
  "epoch": 2.4,
81
+ "learning_rate": 0.0004002425712553063,
82
+ "loss": 0.0917,
83
+ "step": 7920
 
 
 
 
 
 
84
  },
85
  {
86
  "epoch": 2.7,
87
+ "learning_rate": 0.0004502728926622195,
88
+ "loss": 0.0829,
89
+ "step": 8910
 
 
 
 
 
 
90
  },
91
  {
92
  "epoch": 3.0,
93
+ "eval_loss": 0.042510777711868286,
94
+ "eval_max_distance": 46,
95
+ "eval_mean_distance": 1,
96
+ "eval_runtime": 0.5158,
97
+ "eval_samples_per_second": 96.928,
98
+ "eval_steps_per_second": 3.877,
99
+ "step": 9894
100
  },
101
  {
102
  "epoch": 3.0,
103
+ "learning_rate": 0.0005003032140691328,
104
+ "loss": 0.0769,
105
+ "step": 9900
 
 
 
 
 
 
106
  },
107
  {
108
  "epoch": 3.3,
109
+ "learning_rate": 0.0005503335354760462,
110
+ "loss": 0.0667,
111
+ "step": 10890
 
 
 
 
 
 
112
  },
113
  {
114
  "epoch": 3.6,
115
+ "learning_rate": 0.0006003638568829594,
116
  "loss": 0.0653,
117
+ "step": 11880
 
 
 
 
 
 
118
  },
119
  {
120
  "epoch": 3.9,
121
+ "learning_rate": 0.0006503941782898727,
122
+ "loss": 0.0643,
123
+ "step": 12870
124
  },
125
  {
126
  "epoch": 4.0,
127
+ "eval_loss": 0.03110930137336254,
128
+ "eval_max_distance": 64,
129
+ "eval_mean_distance": 1,
130
+ "eval_runtime": 0.4848,
131
+ "eval_samples_per_second": 103.129,
132
+ "eval_steps_per_second": 4.125,
133
+ "step": 13192
 
 
 
 
 
 
134
  },
135
  {
136
  "epoch": 4.2,
137
+ "learning_rate": 0.0007004244996967858,
138
+ "loss": 0.0589,
139
+ "step": 13860
 
 
 
 
 
 
140
  },
141
  {
142
  "epoch": 4.5,
143
+ "learning_rate": 0.0007504548211036993,
144
+ "loss": 0.0549,
145
+ "step": 14850
 
 
 
 
 
 
146
  },
147
  {
148
  "epoch": 4.8,
149
+ "learning_rate": 0.0008004851425106126,
150
+ "loss": 0.0538,
151
+ "step": 15840
 
 
 
 
 
 
152
  },
153
  {
154
  "epoch": 5.0,
155
+ "eval_loss": 0.026651622727513313,
156
+ "eval_max_distance": 48,
157
  "eval_mean_distance": 1,
158
+ "eval_runtime": 0.5057,
159
+ "eval_samples_per_second": 98.878,
160
+ "eval_steps_per_second": 3.955,
161
+ "step": 16490
162
  },
163
  {
164
  "epoch": 5.1,
165
+ "learning_rate": 0.0008505154639175257,
166
+ "loss": 0.048,
167
+ "step": 16830
 
 
 
 
 
 
168
  },
169
  {
170
  "epoch": 5.4,
171
+ "learning_rate": 0.000900545785324439,
172
+ "loss": 0.0461,
173
+ "step": 17820
 
 
 
 
 
 
174
  },
175
  {
176
  "epoch": 5.7,
177
+ "learning_rate": 0.0009505761067313523,
178
+ "loss": 0.0469,
179
+ "step": 18810
 
 
 
 
 
 
180
  },
181
  {
182
  "epoch": 6.0,
183
+ "eval_loss": 0.039574604481458664,
184
+ "eval_max_distance": 80,
185
+ "eval_mean_distance": 2,
186
+ "eval_runtime": 0.5179,
187
+ "eval_samples_per_second": 96.548,
188
+ "eval_steps_per_second": 3.862,
189
+ "step": 19788
190
  },
191
  {
192
  "epoch": 6.0,
193
+ "learning_rate": 0.0009999326190957482,
194
+ "loss": 0.0464,
195
+ "step": 19800
 
 
 
 
 
 
196
  },
197
  {
198
  "epoch": 6.3,
199
+ "learning_rate": 0.0009943736944949802,
200
+ "loss": 0.0393,
201
+ "step": 20790
 
 
 
 
 
 
 
 
 
 
 
 
202
  },
203
  {
204
+ "epoch": 6.6,
205
+ "learning_rate": 0.000988814769894212,
206
+ "loss": 0.0426,
207
+ "step": 21780
208
  },
209
  {
210
+ "epoch": 6.9,
211
+ "learning_rate": 0.000983255845293444,
212
+ "loss": 0.0385,
213
+ "step": 22770
214
  },
215
  {
216
  "epoch": 7.0,
217
+ "eval_loss": 0.026188833639025688,
218
+ "eval_max_distance": 73,
219
+ "eval_mean_distance": 2,
220
+ "eval_runtime": 0.4896,
221
+ "eval_samples_per_second": 102.115,
222
+ "eval_steps_per_second": 4.085,
223
+ "step": 23086
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  },
225
  {
226
+ "epoch": 7.2,
227
+ "learning_rate": 0.0009776969206926756,
228
+ "loss": 0.034,
229
+ "step": 23760
230
  },
231
  {
232
+ "epoch": 7.5,
233
+ "learning_rate": 0.0009721379960919076,
234
+ "loss": 0.0315,
235
+ "step": 24750
236
  },
237
  {
238
+ "epoch": 7.8,
239
+ "learning_rate": 0.0009665790714911395,
240
+ "loss": 0.0316,
241
+ "step": 25740
242
  },
243
  {
244
  "epoch": 8.0,
245
+ "eval_loss": 0.02234221063554287,
246
+ "eval_max_distance": 40,
247
  "eval_mean_distance": 1,
248
+ "eval_runtime": 0.4837,
249
+ "eval_samples_per_second": 103.365,
250
+ "eval_steps_per_second": 4.135,
251
+ "step": 26384
 
 
 
 
 
 
252
  },
253
  {
254
+ "epoch": 8.1,
255
+ "learning_rate": 0.0009610201468903713,
256
+ "loss": 0.0305,
257
+ "step": 26730
258
  },
259
  {
260
  "epoch": 8.41,
261
+ "learning_rate": 0.0009554612222896032,
262
+ "loss": 0.0271,
263
+ "step": 27720
 
 
 
 
 
 
264
  },
265
  {
266
  "epoch": 8.71,
267
+ "learning_rate": 0.0009499022976888349,
268
+ "loss": 0.0263,
269
+ "step": 28710
 
 
 
 
 
 
270
  },
271
  {
272
  "epoch": 9.0,
273
+ "eval_loss": 0.023996921256184578,
274
+ "eval_max_distance": 69,
275
  "eval_mean_distance": 1,
276
+ "eval_runtime": 0.4894,
277
+ "eval_samples_per_second": 102.167,
278
+ "eval_steps_per_second": 4.087,
279
+ "step": 29682
280
  },
281
  {
282
  "epoch": 9.01,
283
+ "learning_rate": 0.0009443433730880669,
284
+ "loss": 0.0282,
285
+ "step": 29700
 
 
 
 
 
 
286
  },
287
  {
288
  "epoch": 9.31,
289
+ "learning_rate": 0.0009387844484872987,
290
+ "loss": 0.0229,
291
+ "step": 30690
 
 
 
 
 
 
292
  },
293
  {
294
  "epoch": 9.61,
295
+ "learning_rate": 0.0009332255238865306,
296
+ "loss": 0.0226,
297
+ "step": 31680
 
 
 
 
 
 
298
  },
299
  {
300
  "epoch": 9.91,
301
+ "learning_rate": 0.0009276665992857625,
302
+ "loss": 0.0226,
303
+ "step": 32670
304
  },
305
  {
306
  "epoch": 10.0,
307
+ "eval_loss": 0.02030733972787857,
308
+ "eval_max_distance": 60,
309
  "eval_mean_distance": 1,
310
+ "eval_runtime": 0.4797,
311
+ "eval_samples_per_second": 104.236,
312
+ "eval_steps_per_second": 4.169,
313
+ "step": 32980
 
 
 
 
 
 
314
  },
315
  {
316
  "epoch": 10.21,
317
+ "learning_rate": 0.0009221076746849943,
318
+ "loss": 0.0209,
319
+ "step": 33660
 
 
 
 
 
 
320
  },
321
  {
322
  "epoch": 10.51,
323
+ "learning_rate": 0.0009165487500842261,
324
+ "loss": 0.02,
325
+ "step": 34650
 
 
 
 
 
 
326
  },
327
  {
328
  "epoch": 10.81,
329
+ "learning_rate": 0.000910989825483458,
330
+ "loss": 0.0203,
331
+ "step": 35640
 
 
 
 
 
 
332
  },
333
  {
334
  "epoch": 11.0,
335
+ "eval_loss": 0.017732510343194008,
336
+ "eval_max_distance": 54,
337
  "eval_mean_distance": 1,
338
+ "eval_runtime": 0.4814,
339
+ "eval_samples_per_second": 103.858,
340
+ "eval_steps_per_second": 4.154,
341
+ "step": 36278
342
  },
343
  {
344
  "epoch": 11.11,
345
+ "learning_rate": 0.0009054309008826899,
346
+ "loss": 0.0183,
347
+ "step": 36630
 
 
 
 
 
 
348
  },
349
  {
350
  "epoch": 11.41,
351
+ "learning_rate": 0.0008998719762819217,
352
+ "loss": 0.0174,
353
+ "step": 37620
 
 
 
 
 
 
354
  },
355
  {
356
  "epoch": 11.71,
357
+ "learning_rate": 0.0008943130516811536,
358
+ "loss": 0.0178,
359
+ "step": 38610
 
 
 
 
 
 
360
  },
361
  {
362
  "epoch": 12.0,
363
+ "eval_loss": 0.018777821213006973,
364
+ "eval_max_distance": 61,
365
  "eval_mean_distance": 1,
366
+ "eval_runtime": 0.4893,
367
+ "eval_samples_per_second": 102.185,
368
+ "eval_steps_per_second": 4.087,
369
+ "step": 39576
370
  },
371
  {
372
  "epoch": 12.01,
373
+ "learning_rate": 0.0008887541270803853,
374
+ "loss": 0.0174,
375
+ "step": 39600
 
 
 
 
 
 
376
  },
377
  {
378
  "epoch": 12.31,
379
+ "learning_rate": 0.0008831952024796173,
380
+ "loss": 0.0153,
381
+ "step": 40590
 
 
 
 
 
 
382
  },
383
  {
384
  "epoch": 12.61,
385
+ "learning_rate": 0.0008776362778788492,
386
+ "loss": 0.015,
387
+ "step": 41580
 
 
 
 
 
 
388
  },
389
  {
390
  "epoch": 12.91,
391
+ "learning_rate": 0.000872077353278081,
392
+ "loss": 0.0154,
393
+ "step": 42570
394
  },
395
  {
396
  "epoch": 13.0,
397
+ "eval_loss": 0.029613599181175232,
398
+ "eval_max_distance": 65,
399
  "eval_mean_distance": 1,
400
+ "eval_runtime": 0.4669,
401
+ "eval_samples_per_second": 107.079,
402
+ "eval_steps_per_second": 4.283,
403
+ "step": 42874
 
 
 
 
 
 
404
  },
405
  {
406
  "epoch": 13.21,
407
+ "learning_rate": 0.0008665184286773129,
408
+ "loss": 0.014,
409
+ "step": 43560
 
 
 
 
 
 
410
  },
411
  {
412
  "epoch": 13.51,
413
+ "learning_rate": 0.0008609595040765447,
414
+ "loss": 0.0135,
415
+ "step": 44550
 
 
 
 
 
 
416
  },
417
  {
418
  "epoch": 13.81,
419
+ "learning_rate": 0.0008554005794757766,
420
+ "loss": 0.0138,
421
+ "step": 45540
 
 
 
 
 
 
422
  },
423
  {
424
  "epoch": 14.0,
425
+ "eval_loss": 0.02011469565331936,
426
+ "eval_max_distance": 55,
427
  "eval_mean_distance": 1,
428
+ "eval_runtime": 0.5034,
429
+ "eval_samples_per_second": 99.332,
430
+ "eval_steps_per_second": 3.973,
431
+ "step": 46172
432
  },
433
  {
434
  "epoch": 14.11,
435
+ "learning_rate": 0.0008498416548750084,
436
+ "loss": 0.0128,
437
+ "step": 46530
 
 
 
 
 
 
438
  },
439
  {
440
  "epoch": 14.41,
441
+ "learning_rate": 0.0008442827302742403,
442
+ "loss": 0.0121,
443
+ "step": 47520
 
 
 
 
 
 
444
  },
445
  {
446
  "epoch": 14.71,
447
+ "learning_rate": 0.0008387238056734722,
448
+ "loss": 0.012,
449
+ "step": 48510
 
 
 
 
 
 
450
  },
451
  {
452
  "epoch": 15.0,
453
+ "eval_loss": 0.026753582060337067,
454
+ "eval_max_distance": 67,
455
  "eval_mean_distance": 1,
456
+ "eval_runtime": 0.4716,
457
+ "eval_samples_per_second": 106.031,
458
+ "eval_steps_per_second": 4.241,
459
+ "step": 49470
460
  },
461
  {
462
  "epoch": 15.01,
463
+ "learning_rate": 0.000833164881072704,
464
+ "loss": 0.0123,
465
+ "step": 49500
 
 
 
 
 
 
466
  },
467
  {
468
  "epoch": 15.31,
469
+ "learning_rate": 0.0008276059564719359,
470
+ "loss": 0.0104,
471
+ "step": 50490
 
 
 
 
 
 
472
  },
473
  {
474
  "epoch": 15.61,
475
+ "learning_rate": 0.0008220470318711677,
476
+ "loss": 0.0109,
477
+ "step": 51480
 
 
 
 
 
 
478
  },
479
  {
480
  "epoch": 15.91,
481
+ "learning_rate": 0.0008164881072703996,
482
+ "loss": 0.0109,
483
+ "step": 52470
484
  },
485
  {
486
  "epoch": 16.0,
487
+ "eval_loss": 0.01633359119296074,
488
+ "eval_max_distance": 35,
489
  "eval_mean_distance": 1,
490
+ "eval_runtime": 0.4971,
491
+ "eval_samples_per_second": 100.579,
492
+ "eval_steps_per_second": 4.023,
493
+ "step": 52768
 
 
 
 
 
 
494
  },
495
  {
496
  "epoch": 16.21,
497
+ "learning_rate": 0.0008109291826696314,
498
+ "loss": 0.0098,
499
+ "step": 53460
 
 
 
 
 
 
500
  },
501
  {
502
  "epoch": 16.51,
503
+ "learning_rate": 0.0008053702580688633,
504
+ "loss": 0.0094,
505
+ "step": 54450
 
 
 
 
 
 
506
  },
507
  {
508
  "epoch": 16.81,
509
+ "learning_rate": 0.0007998113334680952,
510
+ "loss": 0.0105,
511
+ "step": 55440
 
 
 
 
 
 
512
  },
513
  {
514
  "epoch": 17.0,
515
+ "eval_loss": 0.013592842034995556,
516
+ "eval_max_distance": 26,
517
  "eval_mean_distance": 1,
518
+ "eval_runtime": 0.48,
519
+ "eval_samples_per_second": 104.157,
520
+ "eval_steps_per_second": 4.166,
521
+ "step": 56066
522
  },
523
  {
524
  "epoch": 17.11,
525
+ "learning_rate": 0.000794252408867327,
526
+ "loss": 0.0097,
527
+ "step": 56430
 
 
 
 
 
 
528
  },
529
  {
530
  "epoch": 17.41,
531
+ "learning_rate": 0.0007886934842665589,
532
+ "loss": 0.0083,
533
+ "step": 57420
 
 
 
 
 
 
534
  },
535
  {
536
  "epoch": 17.71,
537
+ "learning_rate": 0.0007831345596657907,
538
+ "loss": 0.0092,
539
+ "step": 58410
 
 
 
 
 
 
540
  },
541
  {
542
  "epoch": 18.0,
543
+ "eval_loss": 0.020196767523884773,
544
+ "eval_max_distance": 65,
545
  "eval_mean_distance": 1,
546
+ "eval_runtime": 0.4567,
547
+ "eval_samples_per_second": 109.487,
548
+ "eval_steps_per_second": 4.379,
549
+ "step": 59364
550
  },
551
  {
552
  "epoch": 18.01,
553
+ "learning_rate": 0.0007775756350650226,
554
+ "loss": 0.009,
555
+ "step": 59400
 
 
 
 
 
 
556
  },
557
  {
558
  "epoch": 18.31,
559
+ "learning_rate": 0.0007720167104642545,
560
+ "loss": 0.0075,
561
+ "step": 60390
 
 
 
 
 
 
562
  },
563
  {
564
  "epoch": 18.61,
565
+ "learning_rate": 0.0007664577858634864,
566
+ "loss": 0.0078,
567
+ "step": 61380
 
 
 
 
 
 
568
  },
569
  {
570
  "epoch": 18.91,
571
+ "learning_rate": 0.0007608988612627181,
572
+ "loss": 0.0087,
573
+ "step": 62370
574
  },
575
  {
576
  "epoch": 19.0,
577
+ "eval_loss": 0.02213277295231819,
578
+ "eval_max_distance": 65,
579
  "eval_mean_distance": 1,
580
+ "eval_runtime": 0.4707,
581
+ "eval_samples_per_second": 106.233,
582
+ "eval_steps_per_second": 4.249,
583
+ "step": 62662
 
 
 
 
 
 
584
  },
585
  {
586
  "epoch": 19.21,
587
+ "learning_rate": 0.00075533993666195,
588
+ "loss": 0.0077,
589
+ "step": 63360
590
  },
591
  {
592
+ "epoch": 19.51,
593
+ "learning_rate": 0.0007497810120611818,
594
+ "loss": 0.0071,
595
+ "step": 64350
 
 
 
 
 
 
596
  },
597
  {
598
+ "epoch": 19.81,
599
+ "learning_rate": 0.0007442220874604138,
600
+ "loss": 0.0075,
601
+ "step": 65340
602
  },
603
  {
604
+ "epoch": 20.0,
605
+ "eval_loss": 0.020336275920271873,
606
+ "eval_max_distance": 33,
607
+ "eval_mean_distance": 1,
608
+ "eval_runtime": 0.4773,
609
+ "eval_samples_per_second": 104.749,
610
+ "eval_steps_per_second": 4.19,
611
+ "step": 65960
612
  },
613
  {
614
+ "epoch": 20.11,
615
+ "learning_rate": 0.0007386631628596457,
616
+ "loss": 0.0073,
617
+ "step": 66330
618
  },
619
  {
620
+ "epoch": 20.41,
621
+ "learning_rate": 0.0007331042382588774,
622
+ "loss": 0.0063,
623
+ "step": 67320
 
 
 
 
624
  },
625
  {
626
+ "epoch": 20.71,
627
+ "learning_rate": 0.0007275453136581093,
628
+ "loss": 0.0067,
629
+ "step": 68310
630
  },
631
  {
632
+ "epoch": 21.0,
633
+ "eval_loss": 0.022562623023986816,
634
+ "eval_max_distance": 26,
635
+ "eval_mean_distance": 1,
636
+ "eval_runtime": 0.5033,
637
+ "eval_samples_per_second": 99.35,
638
+ "eval_steps_per_second": 3.974,
639
+ "step": 69258
640
  },
641
  {
642
+ "epoch": 21.01,
643
+ "learning_rate": 0.0007219863890573411,
644
+ "loss": 0.007,
645
+ "step": 69300
646
  },
647
  {
648
+ "epoch": 21.31,
649
+ "learning_rate": 0.000716427464456573,
650
+ "loss": 0.0061,
651
+ "step": 70290
652
  },
653
  {
654
+ "epoch": 21.61,
655
+ "learning_rate": 0.0007108685398558049,
656
+ "loss": 0.006,
657
+ "step": 71280
658
  },
659
  {
660
+ "epoch": 21.91,
661
+ "learning_rate": 0.0007053096152550368,
662
+ "loss": 0.0062,
663
+ "step": 72270
664
  },
665
  {
666
+ "epoch": 22.0,
667
+ "eval_loss": 0.01839238964021206,
668
+ "eval_max_distance": 24,
669
  "eval_mean_distance": 1,
670
+ "eval_runtime": 0.4856,
671
+ "eval_samples_per_second": 102.959,
672
+ "eval_steps_per_second": 4.118,
673
+ "step": 72556
674
  },
675
  {
676
+ "epoch": 22.21,
677
+ "learning_rate": 0.0006997506906542685,
678
+ "loss": 0.0057,
679
+ "step": 73260
680
  },
681
  {
682
+ "epoch": 22.51,
683
+ "learning_rate": 0.0006941917660535004,
684
+ "loss": 0.0058,
685
+ "step": 74250
686
  },
687
  {
688
+ "epoch": 22.81,
689
+ "learning_rate": 0.0006886328414527323,
690
+ "loss": 0.0059,
691
+ "step": 75240
692
  },
693
  {
694
+ "epoch": 23.0,
695
+ "eval_loss": 0.013111269101500511,
696
+ "eval_max_distance": 18,
697
+ "eval_mean_distance": 0,
698
+ "eval_runtime": 0.5001,
699
+ "eval_samples_per_second": 99.983,
700
+ "eval_steps_per_second": 3.999,
701
+ "step": 75854
702
  },
703
  {
704
+ "epoch": 23.11,
705
+ "learning_rate": 0.0006830739168519642,
706
+ "loss": 0.0055,
707
+ "step": 76230
708
  },
709
  {
710
+ "epoch": 23.41,
711
+ "learning_rate": 0.0006775149922511961,
712
+ "loss": 0.0051,
713
+ "step": 77220
714
  },
715
  {
716
+ "epoch": 23.71,
717
+ "learning_rate": 0.0006719560676504279,
718
+ "loss": 0.0054,
719
+ "step": 78210
720
  },
721
  {
722
+ "epoch": 24.0,
723
+ "eval_loss": 0.026959825307130814,
724
+ "eval_max_distance": 58,
725
  "eval_mean_distance": 1,
726
+ "eval_runtime": 0.4725,
727
+ "eval_samples_per_second": 105.825,
728
+ "eval_steps_per_second": 4.233,
729
+ "step": 79152
730
  },
731
  {
732
+ "epoch": 24.01,
733
+ "learning_rate": 0.0006663971430496597,
734
+ "loss": 0.0055,
735
+ "step": 79200
736
  },
737
  {
738
+ "epoch": 24.31,
739
+ "learning_rate": 0.0006608382184488915,
740
+ "loss": 0.0046,
741
+ "step": 80190
742
  },
743
  {
744
+ "epoch": 24.61,
745
+ "learning_rate": 0.0006552792938481235,
746
+ "loss": 0.005,
747
+ "step": 81180
748
  },
749
  {
750
+ "epoch": 24.92,
751
+ "learning_rate": 0.0006497203692473554,
752
+ "loss": 0.0052,
753
+ "step": 82170
754
  },
755
  {
756
+ "epoch": 25.0,
757
+ "eval_loss": 0.024379713460803032,
758
+ "eval_max_distance": 45,
759
+ "eval_mean_distance": 1,
760
+ "eval_runtime": 0.47,
761
+ "eval_samples_per_second": 106.387,
762
+ "eval_steps_per_second": 4.255,
763
+ "step": 82450
764
  },
765
  {
766
+ "epoch": 25.22,
767
+ "learning_rate": 0.0006441614446465872,
768
+ "loss": 0.0048,
769
+ "step": 83160
770
  },
771
  {
772
+ "epoch": 25.52,
773
+ "learning_rate": 0.000638602520045819,
774
+ "loss": 0.0045,
775
+ "step": 84150
776
  },
777
  {
778
+ "epoch": 25.82,
779
+ "learning_rate": 0.0006330435954450508,
780
+ "loss": 0.0044,
781
+ "step": 85140
782
+ },
783
+ {
784
+ "epoch": 26.0,
785
+ "eval_loss": 0.014908027835190296,
786
+ "eval_max_distance": 23,
787
  "eval_mean_distance": 1,
788
+ "eval_runtime": 0.4819,
789
+ "eval_samples_per_second": 103.748,
790
+ "eval_steps_per_second": 4.15,
791
+ "step": 85748
792
  },
793
  {
794
+ "epoch": 26.12,
795
+ "learning_rate": 0.0006274846708442828,
796
+ "loss": 0.0044,
797
+ "step": 86130
798
  },
799
  {
800
+ "epoch": 26.42,
801
+ "learning_rate": 0.0006219257462435146,
802
+ "loss": 0.0042,
803
+ "step": 87120
804
  },
805
  {
806
+ "epoch": 26.72,
807
+ "learning_rate": 0.0006163668216427465,
808
+ "loss": 0.0043,
809
+ "step": 88110
810
  },
811
  {
812
+ "epoch": 27.0,
813
+ "eval_loss": 0.0256387647241354,
814
+ "eval_max_distance": 63,
815
+ "eval_mean_distance": 1,
816
+ "eval_runtime": 0.5104,
817
+ "eval_samples_per_second": 97.954,
818
+ "eval_steps_per_second": 3.918,
819
+ "step": 89046
820
  },
821
  {
822
+ "epoch": 27.02,
823
+ "learning_rate": 0.0006108078970419783,
824
+ "loss": 0.0043,
825
+ "step": 89100
826
  },
827
  {
828
+ "epoch": 27.32,
829
+ "learning_rate": 0.0006052489724412101,
830
+ "loss": 0.004,
831
+ "step": 90090
832
  },
833
  {
834
+ "epoch": 27.62,
835
+ "learning_rate": 0.0005996900478404421,
836
+ "loss": 0.0037,
837
+ "step": 91080
838
+ },
839
+ {
840
+ "epoch": 27.92,
841
+ "learning_rate": 0.0005941311232396739,
842
+ "loss": 0.0038,
843
+ "step": 92070
844
+ },
845
+ {
846
+ "epoch": 28.0,
847
+ "eval_loss": 0.017227506265044212,
848
+ "eval_max_distance": 30,
849
  "eval_mean_distance": 1,
850
+ "eval_runtime": 0.4632,
851
+ "eval_samples_per_second": 107.934,
852
+ "eval_steps_per_second": 4.317,
853
+ "step": 92344
854
  },
855
  {
856
+ "epoch": 28.22,
857
+ "learning_rate": 0.0005885721986389058,
858
+ "loss": 0.0037,
859
+ "step": 93060
860
  },
861
  {
862
+ "epoch": 28.52,
863
+ "learning_rate": 0.0005830132740381376,
864
+ "loss": 0.0038,
865
+ "step": 94050
866
  },
867
  {
868
+ "epoch": 28.82,
869
+ "learning_rate": 0.0005774543494373694,
870
+ "loss": 0.0036,
871
+ "step": 95040
872
  },
873
  {
874
+ "epoch": 29.0,
875
+ "eval_loss": 0.022354494780302048,
876
+ "eval_max_distance": 37,
877
+ "eval_mean_distance": 1,
878
+ "eval_runtime": 0.4846,
879
+ "eval_samples_per_second": 103.187,
880
+ "eval_steps_per_second": 4.127,
881
+ "step": 95642
882
+ },
883
+ {
884
+ "epoch": 29.12,
885
+ "learning_rate": 0.0005718954248366013,
886
+ "loss": 0.0037,
887
+ "step": 96030
888
+ },
889
+ {
890
+ "epoch": 29.42,
891
+ "learning_rate": 0.0005663365002358332,
892
+ "loss": 0.0033,
893
+ "step": 97020
894
+ },
895
+ {
896
+ "epoch": 29.72,
897
+ "learning_rate": 0.000560777575635065,
898
+ "loss": 0.0033,
899
+ "step": 98010
900
+ },
901
+ {
902
+ "epoch": 30.0,
903
+ "eval_loss": 0.01936698891222477,
904
+ "eval_max_distance": 30,
905
+ "eval_mean_distance": 1,
906
+ "eval_runtime": 0.4829,
907
+ "eval_samples_per_second": 103.544,
908
+ "eval_steps_per_second": 4.142,
909
+ "step": 98940
910
+ },
911
+ {
912
+ "epoch": 30.02,
913
+ "learning_rate": 0.0005552186510342969,
914
+ "loss": 0.0035,
915
+ "step": 99000
916
+ },
917
+ {
918
+ "epoch": 30.32,
919
+ "learning_rate": 0.0005496597264335288,
920
+ "loss": 0.003,
921
+ "step": 99990
922
+ },
923
+ {
924
+ "epoch": 30.62,
925
+ "learning_rate": 0.0005441008018327606,
926
+ "loss": 0.0033,
927
+ "step": 100980
928
+ },
929
+ {
930
+ "epoch": 30.92,
931
+ "learning_rate": 0.0005385418772319925,
932
+ "loss": 0.0031,
933
+ "step": 101970
934
+ },
935
+ {
936
+ "epoch": 31.0,
937
+ "eval_loss": 0.023793019354343414,
938
+ "eval_max_distance": 59,
939
+ "eval_mean_distance": 1,
940
+ "eval_runtime": 0.5012,
941
+ "eval_samples_per_second": 99.754,
942
+ "eval_steps_per_second": 3.99,
943
+ "step": 102238
944
+ },
945
+ {
946
+ "epoch": 31.22,
947
+ "learning_rate": 0.0005329829526312243,
948
+ "loss": 0.0029,
949
+ "step": 102960
950
+ },
951
+ {
952
+ "epoch": 31.52,
953
+ "learning_rate": 0.0005274240280304562,
954
+ "loss": 0.003,
955
+ "step": 103950
956
+ },
957
+ {
958
+ "epoch": 31.82,
959
+ "learning_rate": 0.000521865103429688,
960
+ "loss": 0.003,
961
+ "step": 104940
962
+ },
963
+ {
964
+ "epoch": 32.0,
965
+ "eval_loss": 0.02003033086657524,
966
+ "eval_max_distance": 28,
967
+ "eval_mean_distance": 1,
968
+ "eval_runtime": 0.475,
969
+ "eval_samples_per_second": 105.268,
970
+ "eval_steps_per_second": 4.211,
971
+ "step": 105536
972
+ },
973
+ {
974
+ "epoch": 32.12,
975
+ "learning_rate": 0.00051630617882892,
976
+ "loss": 0.0028,
977
+ "step": 105930
978
  },
979
  {
980
+ "epoch": 32.42,
981
+ "learning_rate": 0.0005107472542281517,
982
  "loss": 0.0027,
983
+ "step": 106920
984
  },
985
  {
986
+ "epoch": 32.72,
987
+ "learning_rate": 0.0005051883296273836,
988
+ "loss": 0.0028,
989
+ "step": 107910
990
  },
991
  {
992
+ "epoch": 33.0,
993
+ "eval_loss": 0.01606147363781929,
994
+ "eval_max_distance": 18,
995
+ "eval_mean_distance": 0,
996
+ "eval_runtime": 0.4673,
997
+ "eval_samples_per_second": 107.008,
998
+ "eval_steps_per_second": 4.28,
999
+ "step": 108834
1000
+ },
1001
+ {
1002
+ "epoch": 33.02,
1003
+ "learning_rate": 0.0004996294050266155,
1004
+ "loss": 0.0028,
1005
+ "step": 108900
1006
+ },
1007
+ {
1008
+ "epoch": 33.32,
1009
+ "learning_rate": 0.0004940704804258473,
1010
  "loss": 0.0026,
1011
+ "step": 109890
1012
  },
1013
  {
1014
+ "epoch": 33.62,
1015
+ "learning_rate": 0.0004885115558250792,
1016
+ "loss": 0.0026,
1017
+ "step": 110880
1018
+ },
1019
+ {
1020
+ "epoch": 33.92,
1021
+ "learning_rate": 0.00048295263122431103,
1022
+ "loss": 0.0027,
1023
+ "step": 111870
1024
+ },
1025
+ {
1026
+ "epoch": 34.0,
1027
+ "eval_loss": 0.021506933495402336,
1028
+ "eval_max_distance": 26,
1029
  "eval_mean_distance": 1,
1030
+ "eval_runtime": 0.4763,
1031
+ "eval_samples_per_second": 104.968,
1032
+ "eval_steps_per_second": 4.199,
1033
+ "step": 112132
1034
  },
1035
  {
1036
+ "epoch": 34.22,
1037
+ "learning_rate": 0.00047739370662354294,
1038
+ "loss": 0.0024,
1039
+ "step": 112860
1040
  },
1041
  {
1042
+ "epoch": 34.52,
1043
+ "learning_rate": 0.00047183478202277474,
1044
  "loss": 0.0023,
1045
+ "step": 113850
1046
  },
1047
  {
1048
+ "epoch": 34.82,
1049
+ "learning_rate": 0.0004662758574220066,
1050
+ "loss": 0.0025,
1051
+ "step": 114840
1052
+ },
1053
+ {
1054
+ "epoch": 35.0,
1055
+ "eval_loss": 0.019841769710183144,
1056
+ "eval_max_distance": 19,
1057
+ "eval_mean_distance": 0,
1058
+ "eval_runtime": 0.4767,
1059
+ "eval_samples_per_second": 104.884,
1060
+ "eval_steps_per_second": 4.195,
1061
+ "step": 115430
1062
+ },
1063
+ {
1064
+ "epoch": 35.12,
1065
+ "learning_rate": 0.00046071693282123845,
1066
  "loss": 0.0023,
1067
+ "step": 115830
1068
  },
1069
  {
1070
+ "epoch": 35.42,
1071
+ "learning_rate": 0.0004551580082204703,
1072
  "loss": 0.0021,
1073
+ "step": 116820
1074
  },
1075
  {
1076
+ "epoch": 35.72,
1077
+ "learning_rate": 0.0004495990836197022,
1078
+ "loss": 0.0023,
1079
+ "step": 117810
1080
  },
1081
  {
1082
+ "epoch": 36.0,
1083
+ "eval_loss": 0.01675160974264145,
1084
+ "eval_max_distance": 24,
1085
+ "eval_mean_distance": 0,
1086
+ "eval_runtime": 0.4591,
1087
+ "eval_samples_per_second": 108.901,
1088
+ "eval_steps_per_second": 4.356,
1089
+ "step": 118728
1090
  },
1091
  {
1092
+ "epoch": 36.02,
1093
+ "learning_rate": 0.000444040159018934,
1094
+ "loss": 0.0023,
1095
+ "step": 118800
1096
  },
1097
  {
1098
+ "epoch": 36.32,
1099
+ "learning_rate": 0.0004384812344181659,
1100
+ "loss": 0.0021,
1101
+ "step": 119790
 
 
 
 
1102
  },
1103
  {
1104
+ "epoch": 36.62,
1105
+ "learning_rate": 0.0004329223098173978,
1106
  "loss": 0.0021,
1107
+ "step": 120780
1108
  },
1109
  {
1110
+ "epoch": 36.92,
1111
+ "learning_rate": 0.0004273633852166296,
1112
+ "loss": 0.002,
1113
+ "step": 121770
1114
  },
1115
  {
1116
+ "epoch": 37.0,
1117
+ "eval_loss": 0.022139811888337135,
1118
+ "eval_max_distance": 32,
1119
+ "eval_mean_distance": 1,
1120
+ "eval_runtime": 0.4713,
1121
+ "eval_samples_per_second": 106.08,
1122
+ "eval_steps_per_second": 4.243,
1123
+ "step": 122026
1124
+ },
1125
+ {
1126
+ "epoch": 37.22,
1127
+ "learning_rate": 0.0004218044606158615,
1128
  "loss": 0.002,
1129
+ "step": 122760
1130
  },
1131
  {
1132
+ "epoch": 37.52,
1133
+ "learning_rate": 0.00041624553601509335,
1134
  "loss": 0.0019,
1135
+ "step": 123750
1136
  },
1137
  {
1138
+ "epoch": 37.82,
1139
+ "learning_rate": 0.00041068661141432515,
1140
+ "loss": 0.0019,
1141
+ "step": 124740
1142
  },
1143
  {
1144
+ "epoch": 38.0,
1145
+ "eval_loss": 0.02140805311501026,
1146
+ "eval_max_distance": 32,
1147
+ "eval_mean_distance": 1,
1148
+ "eval_runtime": 0.4808,
1149
+ "eval_samples_per_second": 104.001,
1150
+ "eval_steps_per_second": 4.16,
1151
+ "step": 125324
1152
  },
1153
  {
1154
+ "epoch": 38.12,
1155
+ "learning_rate": 0.00040512768681355706,
1156
+ "loss": 0.0019,
1157
+ "step": 125730
 
 
 
 
1158
  },
1159
  {
1160
+ "epoch": 38.42,
1161
+ "learning_rate": 0.0003995687622127889,
1162
+ "loss": 0.0018,
1163
+ "step": 126720
1164
  },
1165
  {
1166
+ "epoch": 38.72,
1167
+ "learning_rate": 0.0003940098376120208,
1168
  "loss": 0.0017,
1169
+ "step": 127710
1170
  },
1171
  {
1172
+ "epoch": 39.0,
1173
+ "eval_loss": 0.018618840724229813,
1174
+ "eval_max_distance": 19,
1175
+ "eval_mean_distance": 0,
1176
+ "eval_runtime": 0.4752,
1177
+ "eval_samples_per_second": 105.222,
1178
+ "eval_steps_per_second": 4.209,
1179
+ "step": 128622
1180
  },
1181
  {
1182
+ "epoch": 39.02,
1183
+ "learning_rate": 0.00038845091301125263,
1184
+ "loss": 0.002,
1185
+ "step": 128700
1186
  },
1187
  {
1188
+ "epoch": 39.32,
1189
+ "learning_rate": 0.0003828919884104845,
1190
  "loss": 0.0016,
1191
+ "step": 129690
1192
  },
1193
  {
1194
+ "epoch": 39.62,
1195
+ "learning_rate": 0.00037733306380971634,
1196
+ "loss": 0.0017,
1197
+ "step": 130680
1198
  },
1199
  {
1200
+ "epoch": 39.92,
1201
+ "learning_rate": 0.0003717741392089482,
1202
  "loss": 0.0017,
1203
+ "step": 131670
1204
  },
1205
  {
1206
+ "epoch": 40.0,
1207
+ "eval_loss": 0.017086679115891457,
1208
+ "eval_max_distance": 23,
1209
  "eval_mean_distance": 0,
1210
+ "eval_runtime": 0.458,
1211
+ "eval_samples_per_second": 109.178,
1212
+ "eval_steps_per_second": 4.367,
1213
+ "step": 131920
1214
  },
1215
  {
1216
+ "epoch": 40.22,
1217
+ "learning_rate": 0.00036621521460818,
1218
+ "loss": 0.0015,
1219
+ "step": 132660
1220
  },
1221
  {
1222
+ "epoch": 40.52,
1223
+ "learning_rate": 0.0003606562900074119,
1224
  "loss": 0.0016,
1225
+ "step": 133650
1226
  },
1227
  {
1228
+ "epoch": 40.82,
1229
+ "learning_rate": 0.00035509736540664376,
1230
  "loss": 0.0016,
1231
+ "step": 134640
1232
  },
1233
  {
1234
+ "epoch": 41.0,
1235
+ "eval_loss": 0.01638130471110344,
1236
+ "eval_max_distance": 17,
1237
+ "eval_mean_distance": 0,
1238
+ "eval_runtime": 0.4581,
1239
+ "eval_samples_per_second": 109.147,
1240
+ "eval_steps_per_second": 4.366,
1241
+ "step": 135218
1242
  },
1243
  {
1244
+ "epoch": 41.12,
1245
+ "learning_rate": 0.0003495384408058756,
1246
+ "loss": 0.0015,
1247
+ "step": 135630
1248
  },
1249
  {
1250
+ "epoch": 41.43,
1251
+ "learning_rate": 0.0003439795162051075,
1252
+ "loss": 0.0014,
1253
+ "step": 136620
1254
  },
1255
  {
1256
+ "epoch": 41.73,
1257
+ "learning_rate": 0.00033842059160433933,
1258
+ "loss": 0.0015,
1259
+ "step": 137610
1260
  },
1261
  {
1262
+ "epoch": 42.0,
1263
+ "eval_loss": 0.016585057601332664,
1264
+ "eval_max_distance": 21,
1265
+ "eval_mean_distance": 1,
1266
+ "eval_runtime": 0.479,
1267
+ "eval_samples_per_second": 104.393,
1268
+ "eval_steps_per_second": 4.176,
1269
+ "step": 138516
1270
  },
1271
  {
1272
+ "epoch": 42.03,
1273
+ "learning_rate": 0.0003328616670035712,
1274
+ "loss": 0.0014,
1275
+ "step": 138600
1276
+ },
1277
+ {
1278
+ "epoch": 42.33,
1279
+ "learning_rate": 0.00032730274240280304,
1280
  "loss": 0.0015,
1281
+ "step": 139590
1282
  },
1283
  {
1284
+ "epoch": 42.63,
1285
+ "learning_rate": 0.00032174381780203495,
1286
+ "loss": 0.0015,
1287
+ "step": 140580
1288
  },
1289
  {
1290
+ "epoch": 42.93,
1291
+ "learning_rate": 0.00031618489320126675,
1292
  "loss": 0.0014,
1293
+ "step": 141570
1294
+ },
1295
+ {
1296
+ "epoch": 43.0,
1297
+ "eval_loss": 0.016704820096492767,
1298
+ "eval_max_distance": 21,
1299
+ "eval_mean_distance": 0,
1300
+ "eval_runtime": 0.4809,
1301
+ "eval_samples_per_second": 103.976,
1302
+ "eval_steps_per_second": 4.159,
1303
+ "step": 141814
1304
+ },
1305
+ {
1306
+ "epoch": 43.23,
1307
+ "learning_rate": 0.0003106259686004986,
1308
+ "loss": 0.0011,
1309
+ "step": 142560
1310
  },
1311
  {
1312
+ "epoch": 43.53,
1313
+ "learning_rate": 0.0003050670439997305,
1314
  "loss": 0.0013,
1315
+ "step": 143550
1316
  },
1317
  {
1318
+ "epoch": 43.83,
1319
+ "learning_rate": 0.0002995081193989623,
1320
+ "loss": 0.0019,
1321
+ "step": 144540
1322
  },
1323
  {
1324
+ "epoch": 44.0,
1325
+ "eval_loss": 0.019240867346525192,
1326
+ "eval_max_distance": 32,
1327
+ "eval_mean_distance": 1,
1328
+ "eval_runtime": 0.6494,
1329
+ "eval_samples_per_second": 76.999,
1330
+ "eval_steps_per_second": 3.08,
1331
+ "step": 145112
1332
  },
1333
  {
1334
+ "epoch": 44.13,
1335
+ "learning_rate": 0.00029394919479819423,
1336
+ "loss": 0.0012,
1337
+ "step": 145530
1338
+ },
1339
+ {
1340
+ "epoch": 44.43,
1341
+ "learning_rate": 0.00028839027019742603,
1342
+ "loss": 0.0011,
1343
+ "step": 146520
1344
+ },
1345
+ {
1346
+ "epoch": 44.73,
1347
+ "learning_rate": 0.0002828313455966579,
1348
+ "loss": 0.0011,
1349
+ "step": 147510
1350
+ },
1351
+ {
1352
+ "epoch": 45.0,
1353
+ "eval_loss": 0.02091757208108902,
1354
+ "eval_max_distance": 27,
1355
+ "eval_mean_distance": 1,
1356
+ "eval_runtime": 0.4646,
1357
+ "eval_samples_per_second": 107.608,
1358
+ "eval_steps_per_second": 4.304,
1359
+ "step": 148410
1360
+ },
1361
+ {
1362
+ "epoch": 45.03,
1363
+ "learning_rate": 0.0002772724209958898,
1364
+ "loss": 0.0011,
1365
+ "step": 148500
1366
+ },
1367
+ {
1368
+ "epoch": 45.33,
1369
+ "learning_rate": 0.0002717134963951216,
1370
+ "loss": 0.0011,
1371
+ "step": 149490
1372
+ },
1373
+ {
1374
+ "epoch": 45.63,
1375
+ "learning_rate": 0.0002661545717943535,
1376
+ "loss": 0.001,
1377
+ "step": 150480
1378
+ },
1379
+ {
1380
+ "epoch": 45.93,
1381
+ "learning_rate": 0.00026059564719358537,
1382
+ "loss": 0.0011,
1383
+ "step": 151470
1384
+ },
1385
+ {
1386
+ "epoch": 46.0,
1387
+ "eval_loss": 0.02175173908472061,
1388
+ "eval_max_distance": 23,
1389
  "eval_mean_distance": 0,
1390
+ "eval_runtime": 0.4863,
1391
+ "eval_samples_per_second": 102.827,
1392
+ "eval_steps_per_second": 4.113,
1393
+ "step": 151708
1394
  },
1395
  {
1396
+ "epoch": 46.23,
1397
+ "learning_rate": 0.00025503672259281717,
1398
+ "loss": 0.001,
1399
+ "step": 152460
1400
+ },
1401
+ {
1402
+ "epoch": 46.53,
1403
+ "learning_rate": 0.0002494777979920491,
1404
+ "loss": 0.001,
1405
+ "step": 153450
1406
+ },
1407
+ {
1408
+ "epoch": 46.83,
1409
+ "learning_rate": 0.0002439188733912809,
1410
+ "loss": 0.001,
1411
+ "step": 154440
1412
+ },
1413
+ {
1414
+ "epoch": 47.0,
1415
+ "eval_loss": 0.01951581984758377,
1416
+ "eval_max_distance": 25,
1417
+ "eval_mean_distance": 0,
1418
+ "eval_runtime": 0.4608,
1419
+ "eval_samples_per_second": 108.512,
1420
+ "eval_steps_per_second": 4.34,
1421
+ "step": 155006
1422
+ },
1423
+ {
1424
+ "epoch": 47.13,
1425
+ "learning_rate": 0.0002383599487905128,
1426
+ "loss": 0.001,
1427
+ "step": 155430
1428
+ },
1429
+ {
1430
+ "epoch": 47.43,
1431
+ "learning_rate": 0.00023280102418974464,
1432
+ "loss": 0.0009,
1433
+ "step": 156420
1434
+ },
1435
+ {
1436
+ "epoch": 47.73,
1437
+ "learning_rate": 0.00022724209958897647,
1438
+ "loss": 0.0009,
1439
+ "step": 157410
1440
+ },
1441
+ {
1442
+ "epoch": 48.0,
1443
+ "eval_loss": 0.01657327450811863,
1444
+ "eval_max_distance": 15,
1445
+ "eval_mean_distance": 0,
1446
+ "eval_runtime": 0.4688,
1447
+ "eval_samples_per_second": 106.651,
1448
+ "eval_steps_per_second": 4.266,
1449
+ "step": 158304
1450
+ },
1451
+ {
1452
+ "epoch": 48.03,
1453
+ "learning_rate": 0.00022168317498820833,
1454
+ "loss": 0.0009,
1455
+ "step": 158400
1456
+ },
1457
+ {
1458
+ "epoch": 48.33,
1459
+ "learning_rate": 0.0002161242503874402,
1460
+ "loss": 0.0008,
1461
+ "step": 159390
1462
+ },
1463
+ {
1464
+ "epoch": 48.63,
1465
+ "learning_rate": 0.00021056532578667207,
1466
+ "loss": 0.0008,
1467
+ "step": 160380
1468
+ },
1469
+ {
1470
+ "epoch": 48.93,
1471
+ "learning_rate": 0.00020500640118590392,
1472
+ "loss": 0.0008,
1473
+ "step": 161370
1474
+ },
1475
+ {
1476
+ "epoch": 49.0,
1477
+ "eval_loss": 0.020961837843060493,
1478
+ "eval_max_distance": 31,
1479
+ "eval_mean_distance": 1,
1480
+ "eval_runtime": 0.4893,
1481
+ "eval_samples_per_second": 102.188,
1482
+ "eval_steps_per_second": 4.088,
1483
+ "step": 161602
1484
+ },
1485
+ {
1486
+ "epoch": 49.23,
1487
+ "learning_rate": 0.00019944747658513578,
1488
+ "loss": 0.0008,
1489
+ "step": 162360
1490
+ },
1491
+ {
1492
+ "epoch": 49.53,
1493
+ "learning_rate": 0.00019388855198436764,
1494
+ "loss": 0.0008,
1495
+ "step": 163350
1496
+ },
1497
+ {
1498
+ "epoch": 49.83,
1499
+ "learning_rate": 0.0001883296273835995,
1500
+ "loss": 0.0008,
1501
+ "step": 164340
1502
+ },
1503
+ {
1504
+ "epoch": 50.0,
1505
+ "eval_loss": 0.022983456030488014,
1506
+ "eval_max_distance": 22,
1507
+ "eval_mean_distance": 0,
1508
+ "eval_runtime": 0.479,
1509
+ "eval_samples_per_second": 104.39,
1510
+ "eval_steps_per_second": 4.176,
1511
+ "step": 164900
1512
+ },
1513
+ {
1514
+ "epoch": 50.13,
1515
+ "learning_rate": 0.00018277070278283135,
1516
+ "loss": 0.0008,
1517
+ "step": 165330
1518
+ },
1519
+ {
1520
+ "epoch": 50.43,
1521
+ "learning_rate": 0.0001772117781820632,
1522
+ "loss": 0.0007,
1523
+ "step": 166320
1524
+ },
1525
+ {
1526
+ "epoch": 50.73,
1527
+ "learning_rate": 0.00017165285358129506,
1528
+ "loss": 0.0008,
1529
+ "step": 167310
1530
+ },
1531
+ {
1532
+ "epoch": 51.0,
1533
+ "eval_loss": 0.018444916233420372,
1534
+ "eval_max_distance": 15,
1535
+ "eval_mean_distance": 0,
1536
+ "eval_runtime": 0.4866,
1537
+ "eval_samples_per_second": 102.75,
1538
+ "eval_steps_per_second": 4.11,
1539
+ "step": 168198
1540
+ },
1541
+ {
1542
+ "epoch": 51.03,
1543
+ "learning_rate": 0.00016609392898052691,
1544
+ "loss": 0.0007,
1545
+ "step": 168300
1546
+ },
1547
+ {
1548
+ "epoch": 51.33,
1549
+ "learning_rate": 0.0001605350043797588,
1550
+ "loss": 0.0007,
1551
+ "step": 169290
1552
+ },
1553
+ {
1554
+ "epoch": 51.63,
1555
+ "learning_rate": 0.00015497607977899065,
1556
+ "loss": 0.0007,
1557
+ "step": 170280
1558
+ },
1559
+ {
1560
+ "epoch": 51.93,
1561
+ "learning_rate": 0.00014941715517822248,
1562
+ "loss": 0.0007,
1563
+ "step": 171270
1564
+ },
1565
+ {
1566
+ "epoch": 52.0,
1567
+ "eval_loss": 0.01832015998661518,
1568
+ "eval_max_distance": 15,
1569
+ "eval_mean_distance": 0,
1570
+ "eval_runtime": 0.4672,
1571
+ "eval_samples_per_second": 107.025,
1572
+ "eval_steps_per_second": 4.281,
1573
+ "step": 171496
1574
+ },
1575
+ {
1576
+ "epoch": 52.23,
1577
+ "learning_rate": 0.00014385823057745434,
1578
+ "loss": 0.0006,
1579
+ "step": 172260
1580
+ },
1581
+ {
1582
+ "epoch": 52.53,
1583
+ "learning_rate": 0.00013829930597668622,
1584
+ "loss": 0.0006,
1585
+ "step": 173250
1586
+ },
1587
+ {
1588
+ "epoch": 52.83,
1589
+ "learning_rate": 0.00013274038137591808,
1590
+ "loss": 0.0006,
1591
+ "step": 174240
1592
+ },
1593
+ {
1594
+ "epoch": 53.0,
1595
+ "eval_loss": 0.023398304358124733,
1596
+ "eval_max_distance": 32,
1597
+ "eval_mean_distance": 1,
1598
+ "eval_runtime": 0.4822,
1599
+ "eval_samples_per_second": 103.698,
1600
+ "eval_steps_per_second": 4.148,
1601
+ "step": 174794
1602
+ },
1603
+ {
1604
+ "epoch": 53.13,
1605
+ "learning_rate": 0.0001271814567751499,
1606
+ "loss": 0.0006,
1607
+ "step": 175230
1608
+ },
1609
+ {
1610
+ "epoch": 53.43,
1611
+ "learning_rate": 0.00012162253217438179,
1612
+ "loss": 0.0006,
1613
+ "step": 176220
1614
+ },
1615
+ {
1616
+ "epoch": 53.73,
1617
+ "learning_rate": 0.00011606360757361364,
1618
+ "loss": 0.0005,
1619
+ "step": 177210
1620
+ },
1621
+ {
1622
+ "epoch": 54.0,
1623
+ "eval_loss": 0.022733934223651886,
1624
+ "eval_max_distance": 24,
1625
+ "eval_mean_distance": 0,
1626
+ "eval_runtime": 0.4789,
1627
+ "eval_samples_per_second": 104.41,
1628
+ "eval_steps_per_second": 4.176,
1629
+ "step": 178092
1630
+ },
1631
+ {
1632
+ "epoch": 54.03,
1633
+ "learning_rate": 0.0001105046829728455,
1634
+ "loss": 0.0005,
1635
+ "step": 178200
1636
+ },
1637
+ {
1638
+ "epoch": 54.33,
1639
+ "learning_rate": 0.00010494575837207735,
1640
+ "loss": 0.0005,
1641
+ "step": 179190
1642
+ },
1643
+ {
1644
+ "epoch": 54.63,
1645
+ "learning_rate": 9.938683377130921e-05,
1646
+ "loss": 0.0005,
1647
+ "step": 180180
1648
+ },
1649
+ {
1650
+ "epoch": 54.93,
1651
+ "learning_rate": 9.382790917054107e-05,
1652
+ "loss": 0.0004,
1653
+ "step": 181170
1654
+ },
1655
+ {
1656
+ "epoch": 55.0,
1657
+ "eval_loss": 0.018815917894244194,
1658
+ "eval_max_distance": 15,
1659
+ "eval_mean_distance": 0,
1660
+ "eval_runtime": 0.4798,
1661
+ "eval_samples_per_second": 104.21,
1662
+ "eval_steps_per_second": 4.168,
1663
+ "step": 181390
1664
+ },
1665
+ {
1666
+ "epoch": 55.23,
1667
+ "learning_rate": 8.826898456977294e-05,
1668
+ "loss": 0.0005,
1669
+ "step": 182160
1670
+ },
1671
+ {
1672
+ "epoch": 55.53,
1673
+ "learning_rate": 8.271005996900478e-05,
1674
+ "loss": 0.0004,
1675
+ "step": 183150
1676
+ },
1677
+ {
1678
+ "epoch": 55.83,
1679
+ "learning_rate": 7.715113536823665e-05,
1680
+ "loss": 0.0005,
1681
+ "step": 184140
1682
+ },
1683
+ {
1684
+ "epoch": 56.0,
1685
+ "eval_loss": 0.01906018890440464,
1686
+ "eval_max_distance": 15,
1687
+ "eval_mean_distance": 0,
1688
+ "eval_runtime": 0.48,
1689
+ "eval_samples_per_second": 104.168,
1690
+ "eval_steps_per_second": 4.167,
1691
+ "step": 184688
1692
+ },
1693
+ {
1694
+ "epoch": 56.13,
1695
+ "learning_rate": 7.15922107674685e-05,
1696
+ "loss": 0.0004,
1697
+ "step": 185130
1698
+ },
1699
+ {
1700
+ "epoch": 56.43,
1701
+ "learning_rate": 6.603328616670036e-05,
1702
+ "loss": 0.0004,
1703
+ "step": 186120
1704
+ },
1705
+ {
1706
+ "epoch": 56.73,
1707
+ "learning_rate": 6.0474361565932214e-05,
1708
+ "loss": 0.0004,
1709
+ "step": 187110
1710
+ },
1711
+ {
1712
+ "epoch": 57.0,
1713
+ "eval_loss": 0.018282707780599594,
1714
+ "eval_max_distance": 15,
1715
+ "eval_mean_distance": 0,
1716
+ "eval_runtime": 0.4797,
1717
+ "eval_samples_per_second": 104.233,
1718
+ "eval_steps_per_second": 4.169,
1719
+ "step": 187986
1720
+ },
1721
+ {
1722
+ "epoch": 57.03,
1723
+ "learning_rate": 5.491543696516407e-05,
1724
+ "loss": 0.0004,
1725
+ "step": 188100
1726
+ },
1727
+ {
1728
+ "epoch": 57.33,
1729
+ "learning_rate": 4.935651236439593e-05,
1730
+ "loss": 0.0004,
1731
+ "step": 189090
1732
+ },
1733
+ {
1734
+ "epoch": 57.63,
1735
+ "learning_rate": 4.379758776362779e-05,
1736
+ "loss": 0.0004,
1737
+ "step": 190080
1738
+ },
1739
+ {
1740
+ "epoch": 57.94,
1741
+ "learning_rate": 3.823866316285965e-05,
1742
+ "loss": 0.0003,
1743
+ "step": 191070
1744
+ },
1745
+ {
1746
+ "epoch": 58.0,
1747
+ "eval_loss": 0.018019111827015877,
1748
+ "eval_max_distance": 15,
1749
+ "eval_mean_distance": 0,
1750
+ "eval_runtime": 0.4619,
1751
+ "eval_samples_per_second": 108.242,
1752
+ "eval_steps_per_second": 4.33,
1753
+ "step": 191284
1754
+ },
1755
+ {
1756
+ "epoch": 58.24,
1757
+ "learning_rate": 3.2679738562091506e-05,
1758
+ "loss": 0.0004,
1759
+ "step": 192060
1760
+ },
1761
+ {
1762
+ "epoch": 58.54,
1763
+ "learning_rate": 2.7120813961323362e-05,
1764
+ "loss": 0.0004,
1765
+ "step": 193050
1766
+ },
1767
+ {
1768
+ "epoch": 58.84,
1769
+ "learning_rate": 2.1561889360555218e-05,
1770
+ "loss": 0.0003,
1771
+ "step": 194040
1772
+ },
1773
+ {
1774
+ "epoch": 59.0,
1775
+ "eval_loss": 0.01795811764895916,
1776
+ "eval_max_distance": 15,
1777
+ "eval_mean_distance": 0,
1778
+ "eval_runtime": 0.475,
1779
+ "eval_samples_per_second": 105.265,
1780
+ "eval_steps_per_second": 4.211,
1781
+ "step": 194582
1782
+ },
1783
+ {
1784
+ "epoch": 59.14,
1785
+ "learning_rate": 1.6002964759787074e-05,
1786
+ "loss": 0.0004,
1787
+ "step": 195030
1788
+ },
1789
+ {
1790
+ "epoch": 59.44,
1791
+ "learning_rate": 1.0444040159018933e-05,
1792
+ "loss": 0.0004,
1793
+ "step": 196020
1794
+ },
1795
+ {
1796
+ "epoch": 59.74,
1797
+ "learning_rate": 4.885115558250792e-06,
1798
+ "loss": 0.0004,
1799
+ "step": 197010
1800
+ },
1801
+ {
1802
+ "epoch": 60.0,
1803
+ "eval_loss": 0.017678335309028625,
1804
+ "eval_max_distance": 15,
1805
+ "eval_mean_distance": 0,
1806
+ "eval_runtime": 0.4798,
1807
+ "eval_samples_per_second": 104.214,
1808
+ "eval_steps_per_second": 4.169,
1809
+ "step": 197880
1810
+ },
1811
+ {
1812
+ "epoch": 60.0,
1813
+ "step": 197880,
1814
+ "total_flos": 1.1400109636858675e+17,
1815
+ "train_loss": 0.031872519274052644,
1816
+ "train_runtime": 16366.2485,
1817
+ "train_samples_per_second": 362.656,
1818
+ "train_steps_per_second": 12.091
1819
  }
1820
  ],
1821
+ "logging_steps": 990,
1822
+ "max_steps": 197880,
1823
+ "num_train_epochs": 60,
1824
+ "save_steps": 1979,
1825
+ "total_flos": 1.1400109636858675e+17,
1826
  "trial_name": null,
1827
  "trial_params": null
1828
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8559fb7fc2610f478f8fb2eefabd395825b089696477a52ed7cf7234f686c78f
3
  size 4091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:970254644cb218db4599e9310f1083ff5880c007630cc4c6dbec952da37dd2a9
3
  size 4091