alexue4 commited on
Commit
81e89b6
1 Parent(s): 11562f2

End of training

Browse files
Files changed (6) hide show
  1. README.md +28 -68
  2. config.json +1 -1
  3. pytorch_model.bin +1 -1
  4. special_tokens_map.json +102 -0
  5. trainer_state.json +945 -1345
  6. training_args.bin +1 -1
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  license: mit
3
- base_model: cointegrated/rut5-small
4
  tags:
5
  - generated_from_trainer
6
  model-index:
@@ -13,11 +13,11 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # text-normalization-ru-new
15
 
16
- This model is a fine-tuned version of [cointegrated/rut5-small](https://huggingface.co/cointegrated/rut5-small) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.0318
19
  - Mean Distance: 0
20
- - Max Distance: 11
21
 
22
  ## Model description
23
 
@@ -36,79 +36,39 @@ More information needed
36
  ### Training hyperparameters
37
 
38
  The following hyperparameters were used during training:
39
- - learning_rate: 0.001
40
  - train_batch_size: 30
41
  - eval_batch_size: 30
42
  - seed: 42
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
- - num_epochs: 60
47
 
48
  ### Training results
49
 
50
- | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
- |:-------------:|:-----:|:------:|:---------------:|:-------------:|:------------:|
52
- | 0.2251 | 1.0 | 3334 | 0.1190 | 3 | 29 |
53
- | 0.1179 | 2.0 | 6668 | 0.0574 | 2 | 31 |
54
- | 0.0848 | 3.0 | 10002 | 0.0436 | 1 | 15 |
55
- | 0.0618 | 4.0 | 13336 | 0.0359 | 1 | 20 |
56
- | 0.0532 | 5.0 | 16670 | 0.0315 | 0 | 11 |
57
- | 0.0446 | 6.0 | 20004 | 0.0299 | 0 | 16 |
58
- | 0.0388 | 7.0 | 23338 | 0.0295 | 0 | 15 |
59
- | 0.0311 | 8.0 | 26672 | 0.0287 | 0 | 15 |
60
- | 0.0269 | 9.0 | 30006 | 0.0241 | 0 | 15 |
61
- | 0.0232 | 10.0 | 33340 | 0.0228 | 0 | 13 |
62
- | 0.0203 | 11.0 | 36674 | 0.0243 | 0 | 16 |
63
- | 0.0173 | 12.0 | 40008 | 0.0250 | 0 | 15 |
64
- | 0.0151 | 13.0 | 43342 | 0.0244 | 0 | 9 |
65
- | 0.0136 | 14.0 | 46676 | 0.0234 | 0 | 15 |
66
- | 0.0123 | 15.0 | 50010 | 0.0221 | 0 | 9 |
67
- | 0.0113 | 16.0 | 53344 | 0.0244 | 0 | 12 |
68
- | 0.01 | 17.0 | 56678 | 0.0226 | 0 | 13 |
69
- | 0.0089 | 18.0 | 60012 | 0.0271 | 0 | 13 |
70
- | 0.0085 | 19.0 | 63346 | 0.0248 | 0 | 13 |
71
- | 0.0074 | 20.0 | 66680 | 0.0277 | 0 | 12 |
72
- | 0.007 | 21.0 | 70014 | 0.0309 | 0 | 13 |
73
- | 0.0066 | 22.0 | 73348 | 0.0306 | 0 | 11 |
74
- | 0.0056 | 23.0 | 76682 | 0.0287 | 0 | 10 |
75
- | 0.0053 | 24.0 | 80016 | 0.0312 | 0 | 12 |
76
- | 0.0049 | 25.0 | 83350 | 0.0276 | 0 | 11 |
77
- | 0.0053 | 26.0 | 86684 | 0.0308 | 0 | 10 |
78
- | 0.0041 | 27.0 | 90018 | 0.0279 | 0 | 10 |
79
- | 0.0041 | 28.0 | 93352 | 0.0292 | 0 | 11 |
80
- | 0.0037 | 29.0 | 96686 | 0.0306 | 0 | 11 |
81
- | 0.0035 | 30.0 | 100020 | 0.0272 | 0 | 12 |
82
- | 0.0032 | 31.0 | 103354 | 0.0255 | 0 | 9 |
83
- | 0.0031 | 32.0 | 106688 | 0.0293 | 0 | 10 |
84
- | 0.0029 | 33.0 | 110022 | 0.0300 | 0 | 13 |
85
- | 0.0026 | 34.0 | 113356 | 0.0305 | 0 | 11 |
86
- | 0.0024 | 35.0 | 116690 | 0.0273 | 0 | 9 |
87
- | 0.0023 | 36.0 | 120024 | 0.0284 | 0 | 10 |
88
- | 0.0022 | 37.0 | 123358 | 0.0313 | 0 | 13 |
89
- | 0.002 | 38.0 | 126692 | 0.0341 | 0 | 13 |
90
- | 0.0017 | 39.0 | 130026 | 0.0301 | 0 | 13 |
91
- | 0.0017 | 40.0 | 133360 | 0.0330 | 0 | 11 |
92
- | 0.0016 | 41.0 | 136694 | 0.0344 | 0 | 11 |
93
- | 0.0014 | 42.0 | 140028 | 0.0337 | 0 | 10 |
94
- | 0.0013 | 43.0 | 143362 | 0.0292 | 0 | 12 |
95
- | 0.0012 | 44.0 | 146696 | 0.0339 | 0 | 11 |
96
- | 0.0012 | 45.0 | 150030 | 0.0330 | 0 | 11 |
97
- | 0.001 | 46.0 | 153364 | 0.0307 | 0 | 11 |
98
- | 0.001 | 47.0 | 156698 | 0.0330 | 0 | 10 |
99
- | 0.0009 | 48.0 | 160032 | 0.0338 | 0 | 11 |
100
- | 0.0009 | 49.0 | 163366 | 0.0288 | 0 | 10 |
101
- | 0.0008 | 50.0 | 166700 | 0.0256 | 0 | 10 |
102
- | 0.0007 | 51.0 | 170034 | 0.0284 | 0 | 11 |
103
- | 0.0006 | 52.0 | 173368 | 0.0342 | 0 | 10 |
104
- | 0.0006 | 53.0 | 176702 | 0.0312 | 0 | 10 |
105
- | 0.0005 | 54.0 | 180036 | 0.0326 | 0 | 10 |
106
- | 0.0005 | 55.0 | 183370 | 0.0304 | 0 | 11 |
107
- | 0.0005 | 56.0 | 186704 | 0.0300 | 0 | 11 |
108
- | 0.0004 | 57.0 | 190038 | 0.0313 | 0 | 11 |
109
- | 0.0003 | 58.0 | 193372 | 0.0321 | 0 | 11 |
110
- | 0.0003 | 59.0 | 196706 | 0.0316 | 0 | 10 |
111
- | 0.0004 | 60.0 | 200040 | 0.0318 | 0 | 11 |
112
 
113
 
114
  ### Framework versions
 
1
  ---
2
  license: mit
3
+ base_model: alexue4/text-normalization-ru-new
4
  tags:
5
  - generated_from_trainer
6
  model-index:
 
13
 
14
  # text-normalization-ru-new
15
 
16
+ This model is a fine-tuned version of [alexue4/text-normalization-ru-new](https://huggingface.co/alexue4/text-normalization-ru-new) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0007
19
  - Mean Distance: 0
20
+ - Max Distance: 3
21
 
22
  ## Model description
23
 
 
36
  ### Training hyperparameters
37
 
38
  The following hyperparameters were used during training:
39
+ - learning_rate: 0.0001
40
  - train_batch_size: 30
41
  - eval_batch_size: 30
42
  - seed: 42
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
+ - num_epochs: 20
47
 
48
  ### Training results
49
 
50
+ | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
+ |:-------------:|:-----:|:-----:|:---------------:|:-------------:|:------------:|
52
+ | 0.0028 | 1.0 | 3443 | 0.0015 | 0 | 3 |
53
+ | 0.0019 | 2.0 | 6886 | 0.0009 | 0 | 3 |
54
+ | 0.0016 | 3.0 | 10329 | 0.0013 | 0 | 3 |
55
+ | 0.0013 | 4.0 | 13772 | 0.0008 | 0 | 1 |
56
+ | 0.0012 | 5.0 | 17215 | 0.0011 | 0 | 3 |
57
+ | 0.0009 | 6.0 | 20658 | 0.0009 | 0 | 3 |
58
+ | 0.0008 | 7.0 | 24101 | 0.0011 | 0 | 3 |
59
+ | 0.0007 | 8.0 | 27544 | 0.0010 | 0 | 3 |
60
+ | 0.0006 | 9.0 | 30987 | 0.0012 | 0 | 3 |
61
+ | 0.0006 | 10.0 | 34430 | 0.0008 | 0 | 3 |
62
+ | 0.0006 | 11.0 | 37873 | 0.0005 | 0 | 0 |
63
+ | 0.0005 | 12.0 | 41316 | 0.0007 | 0 | 1 |
64
+ | 0.0004 | 13.0 | 44759 | 0.0007 | 0 | 0 |
65
+ | 0.0006 | 14.0 | 48202 | 0.0011 | 0 | 3 |
66
+ | 0.0005 | 15.0 | 51645 | 0.0008 | 0 | 3 |
67
+ | 0.0005 | 16.0 | 55088 | 0.0008 | 0 | 3 |
68
+ | 0.0005 | 17.0 | 58531 | 0.0008 | 0 | 3 |
69
+ | 0.0004 | 18.0 | 61974 | 0.0007 | 0 | 3 |
70
+ | 0.0004 | 19.0 | 65417 | 0.0007 | 0 | 3 |
71
+ | 0.0005 | 20.0 | 68860 | 0.0007 | 0 | 3 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
 
74
  ### Framework versions
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "cointegrated/rut5-small",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
 
1
  {
2
+ "_name_or_path": "alexue4/text-normalization-ru-new",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:640ab93c6e6932ab1eb56e93439e8e20cf9ed1484ccd6ca0aa7250c2acf8ab00
3
  size 258643461
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:426d8940304254b400c7865e3f92b2ed60ec87d2cec52df3644476d19c0451e2
3
  size 258643461
special_tokens_map.json CHANGED
@@ -1,4 +1,106 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "eos_token": "</s>",
3
  "pad_token": "<pad>",
4
  "unk_token": "<unk>"
 
1
  {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
  "eos_token": "</s>",
105
  "pad_token": "<pad>",
106
  "unk_token": "<unk>"
trainer_state.json CHANGED
@@ -1,1828 +1,1428 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 60.0,
5
  "eval_steps": 500,
6
- "global_step": 200040,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 4.999000199960008e-08,
14
- "loss": 13.1619,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.3,
19
- "learning_rate": 5.003999200159968e-05,
20
- "loss": 3.3531,
21
- "step": 1001
22
- },
23
- {
24
- "epoch": 0.6,
25
- "learning_rate": 0.00010007998400319936,
26
- "loss": 0.3338,
27
- "step": 2002
28
  },
29
  {
30
- "epoch": 0.9,
31
- "learning_rate": 0.00015011997600479905,
32
- "loss": 0.2251,
33
- "step": 3003
34
- },
35
- {
36
- "epoch": 1.0,
37
- "eval_loss": 0.118980273604393,
38
- "eval_max_distance": 29,
39
- "eval_mean_distance": 3,
40
- "eval_runtime": 0.3245,
41
- "eval_samples_per_second": 154.076,
42
- "eval_steps_per_second": 6.163,
43
- "step": 3334
44
  },
45
  {
46
- "epoch": 1.2,
47
- "learning_rate": 0.00020015996800639872,
48
- "loss": 0.1668,
49
- "step": 4004
50
- },
51
- {
52
- "epoch": 1.5,
53
- "learning_rate": 0.0002501999600079984,
54
- "loss": 0.1375,
55
- "step": 5005
56
- },
57
- {
58
- "epoch": 1.8,
59
- "learning_rate": 0.0003002399520095981,
60
- "loss": 0.1179,
61
- "step": 6006
62
  },
63
  {
64
- "epoch": 2.0,
65
- "eval_loss": 0.057394467294216156,
66
- "eval_max_distance": 31,
67
- "eval_mean_distance": 2,
68
- "eval_runtime": 0.2749,
69
- "eval_samples_per_second": 181.873,
70
- "eval_steps_per_second": 7.275,
71
- "step": 6668
72
  },
73
  {
74
- "epoch": 2.1,
75
- "learning_rate": 0.00035027994401119777,
76
- "loss": 0.0992,
77
- "step": 7007
78
  },
79
  {
80
- "epoch": 2.4,
81
- "learning_rate": 0.00040031993601279744,
82
- "loss": 0.0886,
83
- "step": 8008
84
  },
85
  {
86
- "epoch": 2.7,
87
- "learning_rate": 0.0004503599280143971,
88
- "loss": 0.0848,
89
- "step": 9009
90
  },
91
  {
92
- "epoch": 3.0,
93
- "eval_loss": 0.043563079088926315,
94
- "eval_max_distance": 15,
95
- "eval_mean_distance": 1,
96
- "eval_runtime": 0.2795,
97
- "eval_samples_per_second": 178.881,
98
- "eval_steps_per_second": 7.155,
99
- "step": 10002
100
  },
101
  {
102
- "epoch": 3.0,
103
- "learning_rate": 0.0005003999200159968,
104
- "loss": 0.0757,
105
- "step": 10010
106
  },
107
  {
108
- "epoch": 3.3,
109
- "learning_rate": 0.0005504399120175964,
110
- "loss": 0.0681,
111
- "step": 11011
 
 
 
 
112
  },
113
  {
114
- "epoch": 3.6,
115
- "learning_rate": 0.0006004799040191962,
116
- "loss": 0.0653,
117
- "step": 12012
118
  },
119
  {
120
- "epoch": 3.9,
121
- "learning_rate": 0.0006505198960207959,
122
- "loss": 0.0618,
123
- "step": 13013
124
  },
125
  {
126
- "epoch": 4.0,
127
- "eval_loss": 0.035945579409599304,
128
- "eval_max_distance": 20,
129
- "eval_mean_distance": 1,
130
- "eval_runtime": 0.2802,
131
- "eval_samples_per_second": 178.422,
132
- "eval_steps_per_second": 7.137,
133
- "step": 13336
134
  },
135
  {
136
- "epoch": 4.2,
137
- "learning_rate": 0.0007005598880223955,
138
- "loss": 0.0564,
139
- "step": 14014
140
  },
141
  {
142
- "epoch": 4.5,
143
- "learning_rate": 0.0007505998800239953,
144
- "loss": 0.0537,
145
- "step": 15015
146
  },
147
  {
148
- "epoch": 4.8,
149
- "learning_rate": 0.0008006398720255949,
150
- "loss": 0.0532,
151
- "step": 16016
152
  },
153
  {
154
- "epoch": 5.0,
155
- "eval_loss": 0.031485434621572495,
156
- "eval_max_distance": 11,
157
- "eval_mean_distance": 0,
158
- "eval_runtime": 0.2717,
159
- "eval_samples_per_second": 184.007,
160
- "eval_steps_per_second": 7.36,
161
- "step": 16670
162
  },
163
  {
164
- "epoch": 5.1,
165
- "learning_rate": 0.0008506798640271945,
166
- "loss": 0.05,
167
- "step": 17017
168
  },
169
  {
170
- "epoch": 5.4,
171
- "learning_rate": 0.0009007198560287942,
172
- "loss": 0.0468,
173
- "step": 18018
174
  },
175
  {
176
- "epoch": 5.7,
177
- "learning_rate": 0.000950759848030394,
178
- "loss": 0.0446,
179
- "step": 19019
180
  },
181
  {
182
- "epoch": 6.0,
183
- "eval_loss": 0.0298615675419569,
184
- "eval_max_distance": 16,
185
  "eval_mean_distance": 0,
186
- "eval_runtime": 0.2573,
187
- "eval_samples_per_second": 194.348,
188
- "eval_steps_per_second": 7.774,
189
- "step": 20004
190
- },
191
- {
192
- "epoch": 6.0,
193
- "learning_rate": 0.000999911128885334,
194
- "loss": 0.0465,
195
- "step": 20020
196
- },
197
- {
198
- "epoch": 6.31,
199
- "learning_rate": 0.0009943511297740451,
200
- "loss": 0.0384,
201
- "step": 21021
202
  },
203
  {
204
- "epoch": 6.61,
205
- "learning_rate": 0.0009887911306627564,
206
- "loss": 0.0378,
207
- "step": 22022
208
  },
209
  {
210
- "epoch": 6.91,
211
- "learning_rate": 0.0009832311315514674,
212
- "loss": 0.0388,
213
- "step": 23023
214
  },
215
  {
216
- "epoch": 7.0,
217
- "eval_loss": 0.029532546177506447,
218
- "eval_max_distance": 15,
219
- "eval_mean_distance": 0,
220
- "eval_runtime": 0.2674,
221
- "eval_samples_per_second": 187.01,
222
- "eval_steps_per_second": 7.48,
223
- "step": 23338
224
  },
225
  {
226
- "epoch": 7.21,
227
- "learning_rate": 0.0009776711324401787,
228
- "loss": 0.0336,
229
- "step": 24024
230
  },
231
  {
232
- "epoch": 7.51,
233
- "learning_rate": 0.0009721111333288898,
234
- "loss": 0.032,
235
- "step": 25025
236
  },
237
  {
238
- "epoch": 7.81,
239
- "learning_rate": 0.000966551134217601,
240
- "loss": 0.0311,
241
- "step": 26026
242
  },
243
  {
244
- "epoch": 8.0,
245
- "eval_loss": 0.02873826026916504,
246
- "eval_max_distance": 15,
247
- "eval_mean_distance": 0,
248
- "eval_runtime": 0.2674,
249
- "eval_samples_per_second": 186.98,
250
- "eval_steps_per_second": 7.479,
251
- "step": 26672
252
  },
253
  {
254
- "epoch": 8.11,
255
- "learning_rate": 0.0009609911351063121,
256
- "loss": 0.0304,
257
- "step": 27027
258
  },
259
  {
260
- "epoch": 8.41,
261
- "learning_rate": 0.0009554311359950233,
262
- "loss": 0.0267,
263
- "step": 28028
264
  },
265
  {
266
- "epoch": 8.71,
267
- "learning_rate": 0.0009498711368837344,
268
- "loss": 0.0269,
269
- "step": 29029
270
  },
271
  {
272
- "epoch": 9.0,
273
- "eval_loss": 0.02408006228506565,
274
- "eval_max_distance": 15,
275
  "eval_mean_distance": 0,
276
- "eval_runtime": 0.2548,
277
- "eval_samples_per_second": 196.242,
278
- "eval_steps_per_second": 7.85,
279
- "step": 30006
280
- },
281
- {
282
- "epoch": 9.01,
283
- "learning_rate": 0.0009443111377724454,
284
- "loss": 0.0269,
285
- "step": 30030
286
  },
287
  {
288
- "epoch": 9.31,
289
- "learning_rate": 0.0009387511386611567,
290
- "loss": 0.022,
291
- "step": 31031
292
- },
293
- {
294
- "epoch": 9.61,
295
- "learning_rate": 0.0009331911395498677,
296
- "loss": 0.0231,
297
- "step": 32032
298
  },
299
  {
300
- "epoch": 9.91,
301
- "learning_rate": 0.000927631140438579,
302
- "loss": 0.0232,
303
- "step": 33033
304
  },
305
  {
306
- "epoch": 10.0,
307
- "eval_loss": 0.022765493020415306,
308
- "eval_max_distance": 13,
309
- "eval_mean_distance": 0,
310
- "eval_runtime": 0.2488,
311
- "eval_samples_per_second": 200.959,
312
- "eval_steps_per_second": 8.038,
313
- "step": 33340
314
  },
315
  {
316
- "epoch": 10.21,
317
- "learning_rate": 0.00092207114132729,
318
- "loss": 0.0199,
319
- "step": 34034
320
  },
321
  {
322
- "epoch": 10.51,
323
- "learning_rate": 0.0009165111422160013,
324
- "loss": 0.0196,
325
- "step": 35035
326
  },
327
  {
328
- "epoch": 10.81,
329
- "learning_rate": 0.0009109511431047123,
330
- "loss": 0.0203,
331
- "step": 36036
332
  },
333
  {
334
- "epoch": 11.0,
335
- "eval_loss": 0.024308495223522186,
336
- "eval_max_distance": 16,
337
- "eval_mean_distance": 0,
338
- "eval_runtime": 0.2617,
339
- "eval_samples_per_second": 191.039,
340
- "eval_steps_per_second": 7.642,
341
- "step": 36674
342
  },
343
  {
344
- "epoch": 11.11,
345
- "learning_rate": 0.0009053911439934236,
346
- "loss": 0.0186,
347
- "step": 37037
348
  },
349
  {
350
- "epoch": 11.41,
351
- "learning_rate": 0.0008998311448821347,
352
- "loss": 0.0167,
353
- "step": 38038
354
  },
355
  {
356
- "epoch": 11.71,
357
- "learning_rate": 0.0008942711457708459,
358
- "loss": 0.0173,
359
- "step": 39039
360
  },
361
  {
362
- "epoch": 12.0,
363
- "eval_loss": 0.0250206608325243,
364
- "eval_max_distance": 15,
365
  "eval_mean_distance": 0,
366
- "eval_runtime": 0.2565,
367
- "eval_samples_per_second": 194.951,
368
- "eval_steps_per_second": 7.798,
369
- "step": 40008
370
- },
371
- {
372
- "epoch": 12.01,
373
- "learning_rate": 0.000888711146659557,
374
- "loss": 0.0178,
375
- "step": 40040
376
  },
377
  {
378
- "epoch": 12.31,
379
- "learning_rate": 0.0008831511475482682,
380
- "loss": 0.0146,
381
- "step": 41041
382
- },
383
- {
384
- "epoch": 12.61,
385
- "learning_rate": 0.0008775911484369793,
386
- "loss": 0.0149,
387
- "step": 42042
388
  },
389
  {
390
- "epoch": 12.91,
391
- "learning_rate": 0.0008720311493256904,
392
- "loss": 0.0151,
393
- "step": 43043
394
  },
395
  {
396
- "epoch": 13.0,
397
- "eval_loss": 0.024401402100920677,
398
- "eval_max_distance": 9,
399
- "eval_mean_distance": 0,
400
- "eval_runtime": 0.2582,
401
- "eval_samples_per_second": 193.662,
402
- "eval_steps_per_second": 7.746,
403
- "step": 43342
404
  },
405
  {
406
- "epoch": 13.21,
407
- "learning_rate": 0.0008664711502144016,
408
- "loss": 0.0138,
409
- "step": 44044
410
  },
411
  {
412
- "epoch": 13.51,
413
- "learning_rate": 0.0008609111511031127,
414
- "loss": 0.0137,
415
- "step": 45045
416
  },
417
  {
418
- "epoch": 13.81,
419
- "learning_rate": 0.0008553511519918239,
420
- "loss": 0.0136,
421
- "step": 46046
422
  },
423
  {
424
- "epoch": 14.0,
425
- "eval_loss": 0.023412013426423073,
426
- "eval_max_distance": 15,
427
- "eval_mean_distance": 0,
428
- "eval_runtime": 0.2465,
429
- "eval_samples_per_second": 202.834,
430
- "eval_steps_per_second": 8.113,
431
- "step": 46676
432
  },
433
  {
434
- "epoch": 14.11,
435
- "learning_rate": 0.000849791152880535,
436
- "loss": 0.0126,
437
- "step": 47047
438
  },
439
  {
440
- "epoch": 14.41,
441
- "learning_rate": 0.0008442311537692462,
442
- "loss": 0.0121,
443
- "step": 48048
444
  },
445
  {
446
- "epoch": 14.71,
447
- "learning_rate": 0.0008386711546579573,
448
- "loss": 0.0123,
449
- "step": 49049
450
  },
451
  {
452
- "epoch": 15.0,
453
- "eval_loss": 0.022092605009675026,
454
- "eval_max_distance": 9,
455
  "eval_mean_distance": 0,
456
- "eval_runtime": 0.2607,
457
- "eval_samples_per_second": 191.77,
458
- "eval_steps_per_second": 7.671,
459
- "step": 50010
460
  },
461
  {
462
- "epoch": 15.01,
463
- "learning_rate": 0.0008331111555466685,
464
- "loss": 0.0125,
465
- "step": 50050
466
- },
467
- {
468
- "epoch": 15.31,
469
- "learning_rate": 0.0008275511564353796,
470
- "loss": 0.0101,
471
- "step": 51051
472
- },
473
- {
474
- "epoch": 15.61,
475
- "learning_rate": 0.0008219911573240908,
476
- "loss": 0.0108,
477
- "step": 52052
478
  },
479
  {
480
- "epoch": 15.91,
481
- "learning_rate": 0.0008164311582128019,
482
- "loss": 0.0113,
483
- "step": 53053
484
  },
485
  {
486
- "epoch": 16.0,
487
- "eval_loss": 0.024386152625083923,
488
- "eval_max_distance": 12,
489
- "eval_mean_distance": 0,
490
- "eval_runtime": 0.2455,
491
- "eval_samples_per_second": 203.682,
492
- "eval_steps_per_second": 8.147,
493
- "step": 53344
494
  },
495
  {
496
- "epoch": 16.21,
497
- "learning_rate": 0.0008108711591015131,
498
- "loss": 0.0099,
499
- "step": 54054
500
  },
501
  {
502
- "epoch": 16.51,
503
- "learning_rate": 0.0008053111599902242,
504
- "loss": 0.0096,
505
- "step": 55055
506
  },
507
  {
508
- "epoch": 16.81,
509
- "learning_rate": 0.0007997511608789353,
510
- "loss": 0.01,
511
- "step": 56056
512
  },
513
  {
514
- "epoch": 17.0,
515
- "eval_loss": 0.02255043014883995,
516
- "eval_max_distance": 13,
517
- "eval_mean_distance": 0,
518
- "eval_runtime": 0.2506,
519
- "eval_samples_per_second": 199.486,
520
- "eval_steps_per_second": 7.979,
521
- "step": 56678
522
  },
523
  {
524
- "epoch": 17.11,
525
- "learning_rate": 0.0007941911617676465,
526
- "loss": 0.0093,
527
- "step": 57057
528
  },
529
  {
530
- "epoch": 17.41,
531
- "learning_rate": 0.0007886311626563576,
532
- "loss": 0.0087,
533
- "step": 58058
534
  },
535
  {
536
- "epoch": 17.71,
537
- "learning_rate": 0.0007830711635450687,
538
- "loss": 0.0089,
539
- "step": 59059
540
  },
541
  {
542
- "epoch": 18.0,
543
- "eval_loss": 0.027119183912873268,
544
- "eval_max_distance": 13,
545
  "eval_mean_distance": 0,
546
- "eval_runtime": 0.2424,
547
- "eval_samples_per_second": 206.232,
548
- "eval_steps_per_second": 8.249,
549
- "step": 60012
550
  },
551
  {
552
- "epoch": 18.01,
553
- "learning_rate": 0.0007775111644337799,
554
- "loss": 0.0091,
555
- "step": 60060
556
- },
557
- {
558
- "epoch": 18.31,
559
- "learning_rate": 0.0007719511653224912,
560
- "loss": 0.0075,
561
- "step": 61061
562
- },
563
- {
564
- "epoch": 18.61,
565
- "learning_rate": 0.0007663911662112022,
566
- "loss": 0.0079,
567
- "step": 62062
568
  },
569
  {
570
- "epoch": 18.92,
571
- "learning_rate": 0.0007608311670999134,
572
- "loss": 0.0085,
573
- "step": 63063
574
  },
575
  {
576
- "epoch": 19.0,
577
- "eval_loss": 0.024822326377034187,
578
- "eval_max_distance": 13,
579
- "eval_mean_distance": 0,
580
- "eval_runtime": 0.2416,
581
- "eval_samples_per_second": 206.915,
582
- "eval_steps_per_second": 8.277,
583
- "step": 63346
584
  },
585
  {
586
- "epoch": 19.22,
587
- "learning_rate": 0.0007552711679886245,
588
- "loss": 0.0071,
589
- "step": 64064
590
  },
591
  {
592
- "epoch": 19.52,
593
- "learning_rate": 0.0007497111688773357,
594
- "loss": 0.0074,
595
- "step": 65065
596
  },
597
  {
598
- "epoch": 19.82,
599
- "learning_rate": 0.0007441511697660468,
600
- "loss": 0.0074,
601
- "step": 66066
602
  },
603
  {
604
- "epoch": 20.0,
605
- "eval_loss": 0.027729548513889313,
606
- "eval_max_distance": 12,
607
- "eval_mean_distance": 0,
608
- "eval_runtime": 0.2481,
609
- "eval_samples_per_second": 201.568,
610
- "eval_steps_per_second": 8.063,
611
- "step": 66680
612
  },
613
  {
614
- "epoch": 20.12,
615
- "learning_rate": 0.000738591170654758,
616
- "loss": 0.007,
617
- "step": 67067
618
  },
619
  {
620
- "epoch": 20.42,
621
- "learning_rate": 0.0007330311715434691,
622
- "loss": 0.0061,
623
- "step": 68068
624
  },
625
  {
626
- "epoch": 20.72,
627
- "learning_rate": 0.0007274711724321802,
628
- "loss": 0.007,
629
- "step": 69069
630
  },
631
  {
632
- "epoch": 21.0,
633
- "eval_loss": 0.030854225158691406,
634
- "eval_max_distance": 13,
635
  "eval_mean_distance": 0,
636
- "eval_runtime": 0.2457,
637
- "eval_samples_per_second": 203.54,
638
- "eval_steps_per_second": 8.142,
639
- "step": 70014
640
- },
641
- {
642
- "epoch": 21.02,
643
- "learning_rate": 0.0007219111733208914,
644
- "loss": 0.0069,
645
- "step": 70070
646
  },
647
  {
648
- "epoch": 21.32,
649
- "learning_rate": 0.0007163511742096025,
650
- "loss": 0.006,
651
- "step": 71071
652
- },
653
- {
654
- "epoch": 21.62,
655
- "learning_rate": 0.0007107911750983137,
656
- "loss": 0.0061,
657
- "step": 72072
658
  },
659
  {
660
- "epoch": 21.92,
661
- "learning_rate": 0.0007052311759870248,
662
- "loss": 0.0066,
663
- "step": 73073
664
  },
665
  {
666
- "epoch": 22.0,
667
- "eval_loss": 0.030563361942768097,
668
- "eval_max_distance": 11,
669
- "eval_mean_distance": 0,
670
- "eval_runtime": 0.2419,
671
- "eval_samples_per_second": 206.734,
672
- "eval_steps_per_second": 8.269,
673
- "step": 73348
674
  },
675
  {
676
- "epoch": 22.22,
677
- "learning_rate": 0.000699671176875736,
678
- "loss": 0.0054,
679
- "step": 74074
680
  },
681
  {
682
- "epoch": 22.52,
683
- "learning_rate": 0.0006941111777644471,
684
- "loss": 0.0061,
685
- "step": 75075
686
  },
687
  {
688
- "epoch": 22.82,
689
- "learning_rate": 0.0006885511786531583,
690
- "loss": 0.0056,
691
- "step": 76076
692
  },
693
  {
694
- "epoch": 23.0,
695
- "eval_loss": 0.028730520978569984,
696
- "eval_max_distance": 10,
697
- "eval_mean_distance": 0,
698
- "eval_runtime": 0.2431,
699
- "eval_samples_per_second": 205.684,
700
- "eval_steps_per_second": 8.227,
701
- "step": 76682
702
  },
703
  {
704
- "epoch": 23.12,
705
- "learning_rate": 0.0006829911795418694,
706
- "loss": 0.0054,
707
- "step": 77077
708
  },
709
  {
710
- "epoch": 23.42,
711
- "learning_rate": 0.0006774311804305806,
712
- "loss": 0.0052,
713
- "step": 78078
714
  },
715
  {
716
- "epoch": 23.72,
717
- "learning_rate": 0.0006718711813192917,
718
- "loss": 0.0053,
719
- "step": 79079
720
  },
721
  {
722
- "epoch": 24.0,
723
- "eval_loss": 0.031197942793369293,
724
- "eval_max_distance": 12,
725
  "eval_mean_distance": 0,
726
- "eval_runtime": 0.2517,
727
- "eval_samples_per_second": 198.643,
728
- "eval_steps_per_second": 7.946,
729
- "step": 80016
730
- },
731
- {
732
- "epoch": 24.02,
733
- "learning_rate": 0.0006663111822080029,
734
- "loss": 0.0054,
735
- "step": 80080
736
  },
737
  {
738
- "epoch": 24.32,
739
- "learning_rate": 0.000660751183096714,
740
- "loss": 0.0044,
741
- "step": 81081
742
- },
743
- {
744
- "epoch": 24.62,
745
- "learning_rate": 0.000655191183985425,
746
- "loss": 0.0048,
747
- "step": 82082
748
  },
749
  {
750
- "epoch": 24.92,
751
- "learning_rate": 0.0006496311848741363,
752
- "loss": 0.0049,
753
- "step": 83083
754
  },
755
  {
756
- "epoch": 25.0,
757
- "eval_loss": 0.0276066605001688,
758
- "eval_max_distance": 11,
759
- "eval_mean_distance": 0,
760
- "eval_runtime": 0.2475,
761
- "eval_samples_per_second": 202.046,
762
- "eval_steps_per_second": 8.082,
763
- "step": 83350
764
  },
765
  {
766
- "epoch": 25.22,
767
- "learning_rate": 0.0006440711857628475,
768
- "loss": 0.0045,
769
- "step": 84084
770
  },
771
  {
772
- "epoch": 25.52,
773
- "learning_rate": 0.0006385111866515586,
774
- "loss": 0.0045,
775
- "step": 85085
776
  },
777
  {
778
- "epoch": 25.82,
779
- "learning_rate": 0.0006329511875402698,
780
- "loss": 0.0053,
781
- "step": 86086
782
  },
783
  {
784
- "epoch": 26.0,
785
- "eval_loss": 0.030818996950984,
786
- "eval_max_distance": 10,
787
- "eval_mean_distance": 0,
788
- "eval_runtime": 0.2424,
789
- "eval_samples_per_second": 206.301,
790
- "eval_steps_per_second": 8.252,
791
- "step": 86684
792
  },
793
  {
794
- "epoch": 26.12,
795
- "learning_rate": 0.0006273911884289809,
796
- "loss": 0.0045,
797
- "step": 87087
798
  },
799
  {
800
- "epoch": 26.42,
801
- "learning_rate": 0.000621831189317692,
802
- "loss": 0.0041,
803
- "step": 88088
804
  },
805
  {
806
- "epoch": 26.72,
807
- "learning_rate": 0.0006162711902064032,
808
- "loss": 0.0041,
809
- "step": 89089
810
  },
811
  {
812
- "epoch": 27.0,
813
- "eval_loss": 0.027929000556468964,
814
- "eval_max_distance": 10,
815
  "eval_mean_distance": 0,
816
- "eval_runtime": 0.2471,
817
- "eval_samples_per_second": 202.312,
818
- "eval_steps_per_second": 8.092,
819
- "step": 90018
820
- },
821
- {
822
- "epoch": 27.02,
823
- "learning_rate": 0.0006107111910951143,
824
- "loss": 0.0043,
825
- "step": 90090
826
  },
827
  {
828
- "epoch": 27.32,
829
- "learning_rate": 0.0006051511919838255,
830
- "loss": 0.0038,
831
- "step": 91091
832
- },
833
- {
834
- "epoch": 27.62,
835
- "learning_rate": 0.0005995911928725366,
836
- "loss": 0.0038,
837
- "step": 92092
838
  },
839
  {
840
- "epoch": 27.92,
841
- "learning_rate": 0.0005940311937612478,
842
- "loss": 0.0041,
843
- "step": 93093
844
  },
845
  {
846
- "epoch": 28.0,
847
- "eval_loss": 0.029230400919914246,
848
- "eval_max_distance": 11,
849
- "eval_mean_distance": 0,
850
- "eval_runtime": 0.2482,
851
- "eval_samples_per_second": 201.481,
852
- "eval_steps_per_second": 8.059,
853
- "step": 93352
854
  },
855
  {
856
- "epoch": 28.22,
857
- "learning_rate": 0.0005884711946499589,
858
- "loss": 0.0037,
859
- "step": 94094
860
  },
861
  {
862
- "epoch": 28.52,
863
- "learning_rate": 0.00058291119553867,
864
- "loss": 0.0033,
865
- "step": 95095
866
  },
867
  {
868
- "epoch": 28.82,
869
- "learning_rate": 0.0005773511964273812,
870
- "loss": 0.0037,
871
- "step": 96096
872
  },
873
  {
874
- "epoch": 29.0,
875
- "eval_loss": 0.030607566237449646,
876
- "eval_max_distance": 11,
877
- "eval_mean_distance": 0,
878
- "eval_runtime": 0.2429,
879
- "eval_samples_per_second": 205.838,
880
- "eval_steps_per_second": 8.234,
881
- "step": 96686
882
  },
883
  {
884
- "epoch": 29.12,
885
- "learning_rate": 0.0005717911973160923,
886
- "loss": 0.0036,
887
- "step": 97097
888
  },
889
  {
890
- "epoch": 29.42,
891
- "learning_rate": 0.0005662311982048035,
892
- "loss": 0.0033,
893
- "step": 98098
894
  },
895
  {
896
- "epoch": 29.72,
897
- "learning_rate": 0.0005606711990935146,
898
- "loss": 0.0035,
899
- "step": 99099
900
  },
901
  {
902
- "epoch": 30.0,
903
- "eval_loss": 0.027241094037890434,
904
- "eval_max_distance": 12,
905
  "eval_mean_distance": 0,
906
- "eval_runtime": 0.2466,
907
- "eval_samples_per_second": 202.757,
908
- "eval_steps_per_second": 8.11,
909
- "step": 100020
910
- },
911
- {
912
- "epoch": 30.02,
913
- "learning_rate": 0.0005551111999822258,
914
- "loss": 0.0033,
915
- "step": 100100
916
  },
917
  {
918
- "epoch": 30.32,
919
- "learning_rate": 0.0005495512008709369,
920
- "loss": 0.003,
921
- "step": 101101
922
- },
923
- {
924
- "epoch": 30.62,
925
- "learning_rate": 0.0005439912017596481,
926
- "loss": 0.0031,
927
- "step": 102102
928
  },
929
  {
930
- "epoch": 30.92,
931
- "learning_rate": 0.0005384312026483592,
932
- "loss": 0.0032,
933
- "step": 103103
934
  },
935
  {
936
- "epoch": 31.0,
937
- "eval_loss": 0.0254651065915823,
938
- "eval_max_distance": 9,
939
- "eval_mean_distance": 0,
940
- "eval_runtime": 0.2446,
941
- "eval_samples_per_second": 204.388,
942
- "eval_steps_per_second": 8.176,
943
- "step": 103354
944
  },
945
  {
946
- "epoch": 31.22,
947
- "learning_rate": 0.0005328712035370704,
948
- "loss": 0.0028,
949
- "step": 104104
950
  },
951
  {
952
- "epoch": 31.53,
953
- "learning_rate": 0.0005273112044257815,
954
- "loss": 0.0029,
955
- "step": 105105
956
  },
957
  {
958
- "epoch": 31.83,
959
- "learning_rate": 0.0005217512053144927,
960
- "loss": 0.0031,
961
- "step": 106106
962
  },
963
  {
964
- "epoch": 32.0,
965
- "eval_loss": 0.02928677573800087,
966
- "eval_max_distance": 10,
967
- "eval_mean_distance": 0,
968
- "eval_runtime": 0.2518,
969
- "eval_samples_per_second": 198.594,
970
- "eval_steps_per_second": 7.944,
971
- "step": 106688
972
  },
973
  {
974
- "epoch": 32.13,
975
- "learning_rate": 0.0005161912062032039,
976
- "loss": 0.0028,
977
- "step": 107107
978
  },
979
  {
980
- "epoch": 32.43,
981
- "learning_rate": 0.0005106312070919149,
982
- "loss": 0.0026,
983
- "step": 108108
984
  },
985
  {
986
- "epoch": 32.73,
987
- "learning_rate": 0.0005050712079806262,
988
- "loss": 0.0029,
989
- "step": 109109
990
  },
991
  {
992
- "epoch": 33.0,
993
- "eval_loss": 0.029988963156938553,
994
- "eval_max_distance": 13,
995
  "eval_mean_distance": 0,
996
- "eval_runtime": 0.2465,
997
- "eval_samples_per_second": 202.802,
998
- "eval_steps_per_second": 8.112,
999
- "step": 110022
1000
- },
1001
- {
1002
- "epoch": 33.03,
1003
- "learning_rate": 0.0004995112088693373,
1004
- "loss": 0.0027,
1005
- "step": 110110
1006
  },
1007
  {
1008
- "epoch": 33.33,
1009
- "learning_rate": 0.0004939512097580485,
1010
- "loss": 0.0025,
1011
- "step": 111111
1012
- },
1013
- {
1014
- "epoch": 33.63,
1015
- "learning_rate": 0.0004883912106467596,
1016
- "loss": 0.0026,
1017
- "step": 112112
1018
  },
1019
  {
1020
- "epoch": 33.93,
1021
- "learning_rate": 0.0004828312115354707,
1022
- "loss": 0.0026,
1023
- "step": 113113
1024
  },
1025
  {
1026
- "epoch": 34.0,
1027
- "eval_loss": 0.03050011210143566,
1028
- "eval_max_distance": 11,
1029
- "eval_mean_distance": 0,
1030
- "eval_runtime": 0.2507,
1031
- "eval_samples_per_second": 199.458,
1032
- "eval_steps_per_second": 7.978,
1033
- "step": 113356
1034
  },
1035
  {
1036
- "epoch": 34.23,
1037
- "learning_rate": 0.00047727121242418185,
1038
- "loss": 0.0025,
1039
- "step": 114114
1040
  },
1041
  {
1042
- "epoch": 34.53,
1043
- "learning_rate": 0.00047171121331289294,
1044
- "loss": 0.0023,
1045
- "step": 115115
1046
  },
1047
  {
1048
- "epoch": 34.83,
1049
- "learning_rate": 0.0004661512142016041,
1050
- "loss": 0.0024,
1051
- "step": 116116
1052
  },
1053
  {
1054
- "epoch": 35.0,
1055
- "eval_loss": 0.027280788868665695,
1056
- "eval_max_distance": 9,
1057
- "eval_mean_distance": 0,
1058
- "eval_runtime": 0.2447,
1059
- "eval_samples_per_second": 204.372,
1060
- "eval_steps_per_second": 8.175,
1061
- "step": 116690
1062
  },
1063
  {
1064
- "epoch": 35.13,
1065
- "learning_rate": 0.00046059121509031524,
1066
- "loss": 0.0024,
1067
- "step": 117117
1068
  },
1069
  {
1070
- "epoch": 35.43,
1071
- "learning_rate": 0.00045503121597902644,
1072
- "loss": 0.0022,
1073
- "step": 118118
1074
  },
1075
  {
1076
- "epoch": 35.73,
1077
- "learning_rate": 0.0004494712168677376,
1078
- "loss": 0.0023,
1079
- "step": 119119
1080
  },
1081
  {
1082
- "epoch": 36.0,
1083
- "eval_loss": 0.028403306379914284,
1084
- "eval_max_distance": 10,
1085
  "eval_mean_distance": 0,
1086
- "eval_runtime": 0.2435,
1087
- "eval_samples_per_second": 205.364,
1088
- "eval_steps_per_second": 8.215,
1089
- "step": 120024
1090
- },
1091
- {
1092
- "epoch": 36.03,
1093
- "learning_rate": 0.00044391121775644874,
1094
- "loss": 0.0022,
1095
- "step": 120120
1096
  },
1097
  {
1098
- "epoch": 36.33,
1099
- "learning_rate": 0.0004383512186451599,
1100
- "loss": 0.002,
1101
- "step": 121121
1102
- },
1103
- {
1104
- "epoch": 36.63,
1105
- "learning_rate": 0.00043279121953387103,
1106
- "loss": 0.0022,
1107
- "step": 122122
1108
  },
1109
  {
1110
- "epoch": 36.93,
1111
- "learning_rate": 0.0004272312204225822,
1112
- "loss": 0.0022,
1113
- "step": 123123
1114
  },
1115
  {
1116
- "epoch": 37.0,
1117
- "eval_loss": 0.03133893013000488,
1118
- "eval_max_distance": 13,
1119
- "eval_mean_distance": 0,
1120
- "eval_runtime": 0.2436,
1121
- "eval_samples_per_second": 205.289,
1122
- "eval_steps_per_second": 8.212,
1123
- "step": 123358
1124
  },
1125
  {
1126
- "epoch": 37.23,
1127
- "learning_rate": 0.00042167122131129333,
1128
- "loss": 0.0019,
1129
- "step": 124124
1130
  },
1131
  {
1132
- "epoch": 37.53,
1133
- "learning_rate": 0.0004161112222000045,
1134
- "loss": 0.0019,
1135
- "step": 125125
1136
  },
1137
  {
1138
- "epoch": 37.83,
1139
- "learning_rate": 0.0004105512230887156,
1140
- "loss": 0.002,
1141
- "step": 126126
1142
  },
1143
  {
1144
- "epoch": 38.0,
1145
- "eval_loss": 0.034086938947439194,
1146
- "eval_max_distance": 13,
1147
- "eval_mean_distance": 0,
1148
- "eval_runtime": 0.242,
1149
- "eval_samples_per_second": 206.579,
1150
- "eval_steps_per_second": 8.263,
1151
- "step": 126692
1152
  },
1153
  {
1154
- "epoch": 38.13,
1155
- "learning_rate": 0.0004049912239774268,
1156
- "loss": 0.002,
1157
- "step": 127127
1158
  },
1159
  {
1160
- "epoch": 38.43,
1161
- "learning_rate": 0.00039943122486613787,
1162
- "loss": 0.0018,
1163
- "step": 128128
1164
  },
1165
  {
1166
- "epoch": 38.73,
1167
- "learning_rate": 0.000393871225754849,
1168
- "loss": 0.0017,
1169
- "step": 129129
1170
  },
1171
  {
1172
- "epoch": 39.0,
1173
- "eval_loss": 0.03005034476518631,
1174
- "eval_max_distance": 13,
1175
  "eval_mean_distance": 0,
1176
- "eval_runtime": 0.2407,
1177
- "eval_samples_per_second": 207.711,
1178
- "eval_steps_per_second": 8.308,
1179
- "step": 130026
1180
- },
1181
- {
1182
- "epoch": 39.03,
1183
- "learning_rate": 0.00038831122664356016,
1184
- "loss": 0.0018,
1185
- "step": 130130
1186
- },
1187
- {
1188
- "epoch": 39.33,
1189
- "learning_rate": 0.0003827512275322713,
1190
- "loss": 0.0016,
1191
- "step": 131131
1192
  },
1193
  {
1194
- "epoch": 39.63,
1195
- "learning_rate": 0.00037719122842098246,
1196
- "loss": 0.0017,
1197
- "step": 132132
1198
  },
1199
  {
1200
- "epoch": 39.93,
1201
- "learning_rate": 0.0003716312293096936,
1202
- "loss": 0.0017,
1203
- "step": 133133
1204
  },
1205
  {
1206
- "epoch": 40.0,
1207
- "eval_loss": 0.03297489508986473,
1208
- "eval_max_distance": 11,
1209
- "eval_mean_distance": 0,
1210
- "eval_runtime": 0.2478,
1211
- "eval_samples_per_second": 201.796,
1212
- "eval_steps_per_second": 8.072,
1213
- "step": 133360
1214
  },
1215
  {
1216
- "epoch": 40.23,
1217
- "learning_rate": 0.00036607123019840476,
1218
- "loss": 0.0015,
1219
- "step": 134134
1220
  },
1221
  {
1222
- "epoch": 40.53,
1223
- "learning_rate": 0.0003605112310871159,
1224
- "loss": 0.0015,
1225
- "step": 135135
1226
  },
1227
  {
1228
- "epoch": 40.83,
1229
- "learning_rate": 0.00035495123197582705,
1230
- "loss": 0.0016,
1231
- "step": 136136
1232
  },
1233
  {
1234
- "epoch": 41.0,
1235
- "eval_loss": 0.03444751352071762,
1236
- "eval_max_distance": 11,
1237
- "eval_mean_distance": 0,
1238
- "eval_runtime": 0.2543,
1239
- "eval_samples_per_second": 196.583,
1240
- "eval_steps_per_second": 7.863,
1241
- "step": 136694
1242
  },
1243
  {
1244
- "epoch": 41.13,
1245
- "learning_rate": 0.0003493912328645382,
1246
- "loss": 0.0015,
1247
- "step": 137137
1248
  },
1249
  {
1250
- "epoch": 41.43,
1251
- "learning_rate": 0.00034383123375324935,
1252
- "loss": 0.0014,
1253
- "step": 138138
1254
  },
1255
  {
1256
- "epoch": 41.73,
1257
- "learning_rate": 0.0003382712346419605,
1258
- "loss": 0.0014,
1259
- "step": 139139
1260
  },
1261
  {
1262
- "epoch": 42.0,
1263
- "eval_loss": 0.033661480993032455,
1264
- "eval_max_distance": 10,
1265
  "eval_mean_distance": 0,
1266
- "eval_runtime": 0.251,
1267
- "eval_samples_per_second": 199.199,
1268
- "eval_steps_per_second": 7.968,
1269
- "step": 140028
1270
- },
1271
- {
1272
- "epoch": 42.03,
1273
- "learning_rate": 0.0003327112355306717,
1274
- "loss": 0.0015,
1275
- "step": 140140
1276
- },
1277
- {
1278
- "epoch": 42.33,
1279
- "learning_rate": 0.0003271512364193828,
1280
- "loss": 0.0014,
1281
- "step": 141141
1282
  },
1283
  {
1284
- "epoch": 42.63,
1285
- "learning_rate": 0.00032159123730809394,
1286
- "loss": 0.0014,
1287
- "step": 142142
1288
  },
1289
  {
1290
- "epoch": 42.93,
1291
- "learning_rate": 0.0003160312381968051,
1292
- "loss": 0.0013,
1293
- "step": 143143
1294
  },
1295
  {
1296
- "epoch": 43.0,
1297
- "eval_loss": 0.029230637475848198,
1298
- "eval_max_distance": 12,
1299
- "eval_mean_distance": 0,
1300
- "eval_runtime": 0.2458,
1301
- "eval_samples_per_second": 203.394,
1302
- "eval_steps_per_second": 8.136,
1303
- "step": 143362
1304
  },
1305
  {
1306
- "epoch": 43.23,
1307
- "learning_rate": 0.00031047123908551624,
1308
- "loss": 0.0012,
1309
- "step": 144144
1310
  },
1311
  {
1312
- "epoch": 43.53,
1313
- "learning_rate": 0.0003049112399742274,
1314
- "loss": 0.0012,
1315
- "step": 145145
1316
  },
1317
  {
1318
- "epoch": 43.84,
1319
- "learning_rate": 0.00029935124086293854,
1320
- "loss": 0.0012,
1321
- "step": 146146
1322
  },
1323
  {
1324
- "epoch": 44.0,
1325
- "eval_loss": 0.03386835753917694,
1326
- "eval_max_distance": 11,
1327
- "eval_mean_distance": 0,
1328
- "eval_runtime": 0.248,
1329
- "eval_samples_per_second": 201.602,
1330
- "eval_steps_per_second": 8.064,
1331
- "step": 146696
1332
  },
1333
  {
1334
- "epoch": 44.14,
1335
- "learning_rate": 0.0002937912417516497,
1336
- "loss": 0.0012,
1337
- "step": 147147
1338
  },
1339
  {
1340
- "epoch": 44.44,
1341
- "learning_rate": 0.00028823124264036083,
1342
- "loss": 0.0011,
1343
- "step": 148148
1344
  },
1345
  {
1346
- "epoch": 44.74,
1347
- "learning_rate": 0.000282671243529072,
1348
- "loss": 0.0012,
1349
- "step": 149149
1350
  },
1351
  {
1352
- "epoch": 45.0,
1353
- "eval_loss": 0.03299795091152191,
1354
- "eval_max_distance": 11,
1355
  "eval_mean_distance": 0,
1356
- "eval_runtime": 0.2516,
1357
- "eval_samples_per_second": 198.692,
1358
- "eval_steps_per_second": 7.948,
1359
- "step": 150030
1360
- },
1361
- {
1362
- "epoch": 45.04,
1363
- "learning_rate": 0.00027711124441778313,
1364
- "loss": 0.0012,
1365
- "step": 150150
1366
- },
1367
- {
1368
- "epoch": 45.34,
1369
- "learning_rate": 0.0002715512453064943,
1370
- "loss": 0.001,
1371
- "step": 151151
1372
  },
1373
  {
1374
- "epoch": 45.64,
1375
- "learning_rate": 0.0002659912461952054,
1376
- "loss": 0.0011,
1377
- "step": 152152
1378
  },
1379
  {
1380
- "epoch": 45.94,
1381
- "learning_rate": 0.0002604312470839166,
1382
- "loss": 0.001,
1383
- "step": 153153
1384
  },
1385
  {
1386
- "epoch": 46.0,
1387
- "eval_loss": 0.030699940398335457,
1388
- "eval_max_distance": 11,
1389
- "eval_mean_distance": 0,
1390
- "eval_runtime": 0.2486,
1391
- "eval_samples_per_second": 201.091,
1392
- "eval_steps_per_second": 8.044,
1393
- "step": 153364
1394
  },
1395
  {
1396
- "epoch": 46.24,
1397
- "learning_rate": 0.0002548712479726277,
1398
- "loss": 0.001,
1399
- "step": 154154
1400
  },
1401
  {
1402
- "epoch": 46.54,
1403
- "learning_rate": 0.00024931124886133887,
1404
- "loss": 0.0009,
1405
- "step": 155155
1406
  },
1407
  {
1408
- "epoch": 46.84,
1409
- "learning_rate": 0.00024375124975005,
1410
- "loss": 0.001,
1411
- "step": 156156
1412
  },
1413
  {
1414
- "epoch": 47.0,
1415
- "eval_loss": 0.032952647656202316,
1416
- "eval_max_distance": 10,
1417
- "eval_mean_distance": 0,
1418
- "eval_runtime": 0.2471,
1419
- "eval_samples_per_second": 202.373,
1420
- "eval_steps_per_second": 8.095,
1421
- "step": 156698
1422
  },
1423
  {
1424
- "epoch": 47.14,
1425
- "learning_rate": 0.00023819125063876117,
1426
- "loss": 0.0013,
1427
- "step": 157157
1428
  },
1429
  {
1430
- "epoch": 47.44,
1431
- "learning_rate": 0.0002326312515274723,
1432
- "loss": 0.0009,
1433
- "step": 158158
1434
  },
1435
  {
1436
- "epoch": 47.74,
1437
- "learning_rate": 0.00022707125241618344,
1438
- "loss": 0.0009,
1439
- "step": 159159
1440
  },
1441
  {
1442
- "epoch": 48.0,
1443
- "eval_loss": 0.03382818400859833,
1444
- "eval_max_distance": 11,
1445
  "eval_mean_distance": 0,
1446
- "eval_runtime": 0.2551,
1447
- "eval_samples_per_second": 195.998,
1448
- "eval_steps_per_second": 7.84,
1449
- "step": 160032
1450
  },
1451
  {
1452
- "epoch": 48.04,
1453
- "learning_rate": 0.00022151125330489458,
1454
- "loss": 0.0009,
1455
- "step": 160160
1456
- },
1457
- {
1458
- "epoch": 48.34,
1459
- "learning_rate": 0.00021595125419360573,
1460
- "loss": 0.0008,
1461
- "step": 161161
1462
- },
1463
- {
1464
- "epoch": 48.64,
1465
- "learning_rate": 0.00021039125508231688,
1466
- "loss": 0.0009,
1467
- "step": 162162
1468
  },
1469
  {
1470
- "epoch": 48.94,
1471
- "learning_rate": 0.00020483125597102803,
1472
- "loss": 0.0009,
1473
- "step": 163163
1474
  },
1475
  {
1476
- "epoch": 49.0,
1477
- "eval_loss": 0.02877364680171013,
1478
- "eval_max_distance": 10,
1479
- "eval_mean_distance": 0,
1480
- "eval_runtime": 0.2518,
1481
- "eval_samples_per_second": 198.574,
1482
- "eval_steps_per_second": 7.943,
1483
- "step": 163366
1484
  },
1485
  {
1486
- "epoch": 49.24,
1487
- "learning_rate": 0.00019927125685973918,
1488
- "loss": 0.0008,
1489
- "step": 164164
1490
  },
1491
  {
1492
- "epoch": 49.54,
1493
- "learning_rate": 0.0001937112577484503,
1494
- "loss": 0.0008,
1495
- "step": 165165
1496
  },
1497
  {
1498
- "epoch": 49.84,
1499
- "learning_rate": 0.00018815125863716145,
1500
- "loss": 0.0008,
1501
- "step": 166166
1502
  },
1503
  {
1504
- "epoch": 50.0,
1505
- "eval_loss": 0.02558927983045578,
1506
- "eval_max_distance": 10,
1507
- "eval_mean_distance": 0,
1508
- "eval_runtime": 0.2461,
1509
- "eval_samples_per_second": 203.155,
1510
- "eval_steps_per_second": 8.126,
1511
- "step": 166700
1512
  },
1513
  {
1514
- "epoch": 50.14,
1515
- "learning_rate": 0.0001825912595258726,
1516
- "loss": 0.0007,
1517
- "step": 167167
1518
  },
1519
  {
1520
- "epoch": 50.44,
1521
- "learning_rate": 0.00017703126041458374,
1522
- "loss": 0.0007,
1523
- "step": 168168
1524
  },
1525
  {
1526
- "epoch": 50.74,
1527
- "learning_rate": 0.00017147126130329492,
1528
- "loss": 0.0007,
1529
- "step": 169169
1530
  },
1531
  {
1532
- "epoch": 51.0,
1533
- "eval_loss": 0.02841602824628353,
1534
- "eval_max_distance": 11,
1535
  "eval_mean_distance": 0,
1536
- "eval_runtime": 0.2394,
1537
- "eval_samples_per_second": 208.815,
1538
- "eval_steps_per_second": 8.353,
1539
- "step": 170034
1540
- },
1541
- {
1542
- "epoch": 51.04,
1543
- "learning_rate": 0.00016591126219200607,
1544
- "loss": 0.0007,
1545
- "step": 170170
1546
  },
1547
  {
1548
- "epoch": 51.34,
1549
- "learning_rate": 0.0001603512630807172,
1550
- "loss": 0.0007,
1551
- "step": 171171
1552
- },
1553
- {
1554
- "epoch": 51.64,
1555
- "learning_rate": 0.00015479126396942834,
1556
- "loss": 0.0006,
1557
- "step": 172172
1558
  },
1559
  {
1560
- "epoch": 51.94,
1561
- "learning_rate": 0.00014923126485813948,
1562
- "loss": 0.0006,
1563
- "step": 173173
1564
  },
1565
  {
1566
- "epoch": 52.0,
1567
- "eval_loss": 0.03416401892900467,
1568
- "eval_max_distance": 10,
1569
- "eval_mean_distance": 0,
1570
- "eval_runtime": 0.2536,
1571
- "eval_samples_per_second": 197.147,
1572
- "eval_steps_per_second": 7.886,
1573
- "step": 173368
1574
  },
1575
  {
1576
- "epoch": 52.24,
1577
- "learning_rate": 0.00014367126574685063,
1578
- "loss": 0.0006,
1579
- "step": 174174
1580
  },
1581
  {
1582
- "epoch": 52.54,
1583
- "learning_rate": 0.00013811126663556178,
1584
- "loss": 0.0006,
1585
- "step": 175175
1586
  },
1587
  {
1588
- "epoch": 52.84,
1589
- "learning_rate": 0.00013255126752427293,
1590
- "loss": 0.0006,
1591
- "step": 176176
1592
  },
1593
  {
1594
- "epoch": 53.0,
1595
- "eval_loss": 0.031156664714217186,
1596
- "eval_max_distance": 10,
1597
- "eval_mean_distance": 0,
1598
- "eval_runtime": 0.2541,
1599
- "eval_samples_per_second": 196.804,
1600
- "eval_steps_per_second": 7.872,
1601
- "step": 176702
1602
  },
1603
  {
1604
- "epoch": 53.14,
1605
- "learning_rate": 0.00012699126841298408,
1606
- "loss": 0.0006,
1607
- "step": 177177
1608
  },
1609
  {
1610
- "epoch": 53.44,
1611
- "learning_rate": 0.00012143126930169523,
1612
- "loss": 0.0005,
1613
- "step": 178178
1614
  },
1615
  {
1616
- "epoch": 53.74,
1617
- "learning_rate": 0.00011587127019040637,
1618
- "loss": 0.0005,
1619
- "step": 179179
1620
  },
1621
  {
1622
- "epoch": 54.0,
1623
- "eval_loss": 0.03255148231983185,
1624
- "eval_max_distance": 10,
1625
  "eval_mean_distance": 0,
1626
- "eval_runtime": 0.2469,
1627
- "eval_samples_per_second": 202.55,
1628
- "eval_steps_per_second": 8.102,
1629
- "step": 180036
1630
  },
1631
  {
1632
- "epoch": 54.04,
1633
- "learning_rate": 0.00011031127107911751,
1634
- "loss": 0.0005,
1635
- "step": 180180
1636
  },
1637
  {
1638
- "epoch": 54.34,
1639
- "learning_rate": 0.00010475127196782866,
1640
- "loss": 0.0006,
1641
- "step": 181181
1642
  },
1643
  {
1644
- "epoch": 54.64,
1645
- "learning_rate": 9.91912728565398e-05,
1646
- "loss": 0.0005,
1647
- "step": 182182
1648
  },
1649
  {
1650
- "epoch": 54.94,
1651
- "learning_rate": 9.363127374525095e-05,
1652
- "loss": 0.0005,
1653
- "step": 183183
1654
  },
1655
  {
1656
- "epoch": 55.0,
1657
- "eval_loss": 0.030407395213842392,
1658
- "eval_max_distance": 11,
1659
- "eval_mean_distance": 0,
1660
- "eval_runtime": 0.2417,
1661
- "eval_samples_per_second": 206.906,
1662
- "eval_steps_per_second": 8.276,
1663
- "step": 183370
1664
  },
1665
  {
1666
- "epoch": 55.24,
1667
- "learning_rate": 8.80712746339621e-05,
1668
- "loss": 0.0005,
1669
- "step": 184184
1670
  },
1671
  {
1672
- "epoch": 55.54,
1673
- "learning_rate": 8.251127552267325e-05,
1674
  "loss": 0.0004,
1675
- "step": 185185
1676
  },
1677
  {
1678
- "epoch": 55.84,
1679
- "learning_rate": 7.695127641138438e-05,
1680
  "loss": 0.0005,
1681
- "step": 186186
1682
- },
1683
- {
1684
- "epoch": 56.0,
1685
- "eval_loss": 0.02997196838259697,
1686
- "eval_max_distance": 11,
1687
- "eval_mean_distance": 0,
1688
- "eval_runtime": 0.2484,
1689
- "eval_samples_per_second": 201.291,
1690
- "eval_steps_per_second": 8.052,
1691
- "step": 186704
1692
- },
1693
- {
1694
- "epoch": 56.14,
1695
- "learning_rate": 7.139127730009553e-05,
1696
- "loss": 0.0004,
1697
- "step": 187187
1698
  },
1699
  {
1700
- "epoch": 56.45,
1701
- "learning_rate": 6.583127818880668e-05,
1702
  "loss": 0.0004,
1703
- "step": 188188
1704
  },
1705
  {
1706
- "epoch": 56.75,
1707
- "learning_rate": 6.027127907751783e-05,
1708
  "loss": 0.0004,
1709
- "step": 189189
1710
  },
1711
  {
1712
- "epoch": 57.0,
1713
- "eval_loss": 0.03127776086330414,
1714
- "eval_max_distance": 11,
1715
  "eval_mean_distance": 0,
1716
- "eval_runtime": 0.2542,
1717
- "eval_samples_per_second": 196.708,
1718
- "eval_steps_per_second": 7.868,
1719
- "step": 190038
1720
  },
1721
  {
1722
- "epoch": 57.05,
1723
- "learning_rate": 5.471127996622898e-05,
1724
  "loss": 0.0004,
1725
- "step": 190190
1726
  },
1727
  {
1728
- "epoch": 57.35,
1729
- "learning_rate": 4.9151280854940125e-05,
1730
  "loss": 0.0004,
1731
- "step": 191191
1732
  },
1733
  {
1734
- "epoch": 57.65,
1735
- "learning_rate": 4.359128174365127e-05,
1736
  "loss": 0.0004,
1737
- "step": 192192
1738
  },
1739
  {
1740
- "epoch": 57.95,
1741
- "learning_rate": 3.803128263236242e-05,
1742
  "loss": 0.0003,
1743
- "step": 193193
1744
  },
1745
  {
1746
- "epoch": 58.0,
1747
- "eval_loss": 0.03212800994515419,
1748
- "eval_max_distance": 11,
1749
- "eval_mean_distance": 0,
1750
- "eval_runtime": 0.236,
1751
- "eval_samples_per_second": 211.858,
1752
- "eval_steps_per_second": 8.474,
1753
- "step": 193372
1754
- },
1755
- {
1756
- "epoch": 58.25,
1757
- "learning_rate": 3.247128352107356e-05,
1758
- "loss": 0.0003,
1759
- "step": 194194
1760
  },
1761
  {
1762
- "epoch": 58.55,
1763
- "learning_rate": 2.691128440978471e-05,
1764
  "loss": 0.0004,
1765
- "step": 195195
1766
  },
1767
  {
1768
- "epoch": 58.85,
1769
- "learning_rate": 2.135128529849586e-05,
1770
- "loss": 0.0003,
1771
- "step": 196196
1772
  },
1773
  {
1774
- "epoch": 59.0,
1775
- "eval_loss": 0.031559597700834274,
1776
- "eval_max_distance": 10,
1777
- "eval_mean_distance": 0,
1778
- "eval_runtime": 0.2475,
1779
- "eval_samples_per_second": 201.99,
1780
- "eval_steps_per_second": 8.08,
1781
- "step": 196706
1782
  },
1783
  {
1784
- "epoch": 59.15,
1785
- "learning_rate": 1.5791286187207e-05,
1786
- "loss": 0.0003,
1787
- "step": 197197
1788
  },
1789
  {
1790
- "epoch": 59.45,
1791
- "learning_rate": 1.023128707591815e-05,
1792
- "loss": 0.0003,
1793
- "step": 198198
1794
  },
1795
  {
1796
- "epoch": 59.75,
1797
- "learning_rate": 4.671287964629296e-06,
1798
- "loss": 0.0004,
1799
- "step": 199199
 
 
 
 
1800
  },
1801
  {
1802
- "epoch": 60.0,
1803
- "eval_loss": 0.03177854046225548,
1804
- "eval_max_distance": 11,
1805
- "eval_mean_distance": 0,
1806
- "eval_runtime": 0.2438,
1807
- "eval_samples_per_second": 205.126,
1808
- "eval_steps_per_second": 8.205,
1809
- "step": 200040
1810
- },
1811
- {
1812
- "epoch": 60.0,
1813
- "step": 200040,
1814
- "total_flos": 1.1617191885791232e+17,
1815
- "train_loss": 0.03170474885008116,
1816
- "train_runtime": 15592.8332,
1817
- "train_samples_per_second": 384.846,
1818
- "train_steps_per_second": 12.829
1819
  }
1820
  ],
1821
- "logging_steps": 1001,
1822
- "max_steps": 200040,
1823
- "num_train_epochs": 60,
1824
- "save_steps": 2001,
1825
- "total_flos": 1.1617191885791232e+17,
1826
  "trial_name": null,
1827
  "trial_params": null
1828
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 20.0,
5
  "eval_steps": 500,
6
+ "global_step": 68860,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 1.4522218995062446e-08,
14
+ "loss": 0.0,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.1,
19
+ "learning_rate": 5.010165553296544e-06,
20
+ "loss": 0.0114,
21
+ "step": 345
 
 
 
 
 
 
22
  },
23
  {
24
+ "epoch": 0.2,
25
+ "learning_rate": 1.0020331106593089e-05,
26
+ "loss": 0.0115,
27
+ "step": 690
 
 
 
 
 
 
 
 
 
 
28
  },
29
  {
30
+ "epoch": 0.3,
31
+ "learning_rate": 1.5030496659889632e-05,
32
+ "loss": 0.0099,
33
+ "step": 1035
 
 
 
 
 
 
 
 
 
 
 
 
34
  },
35
  {
36
+ "epoch": 0.4,
37
+ "learning_rate": 2.0040662213186177e-05,
38
+ "loss": 0.0076,
39
+ "step": 1380
 
 
 
 
40
  },
41
  {
42
+ "epoch": 0.5,
43
+ "learning_rate": 2.505082776648272e-05,
44
+ "loss": 0.0047,
45
+ "step": 1725
46
  },
47
  {
48
+ "epoch": 0.6,
49
+ "learning_rate": 3.0060993319779264e-05,
50
+ "loss": 0.0039,
51
+ "step": 2070
52
  },
53
  {
54
+ "epoch": 0.7,
55
+ "learning_rate": 3.507115887307581e-05,
56
+ "loss": 0.0036,
57
+ "step": 2415
58
  },
59
  {
60
+ "epoch": 0.8,
61
+ "learning_rate": 4.0081324426372355e-05,
62
+ "loss": 0.0031,
63
+ "step": 2760
 
 
 
 
64
  },
65
  {
66
+ "epoch": 0.9,
67
+ "learning_rate": 4.509148997966889e-05,
68
+ "loss": 0.0028,
69
+ "step": 3105
70
  },
71
  {
72
+ "epoch": 1.0,
73
+ "eval_loss": 0.001530332607217133,
74
+ "eval_max_distance": 3,
75
+ "eval_mean_distance": 0,
76
+ "eval_runtime": 2.4239,
77
+ "eval_samples_per_second": 20.628,
78
+ "eval_steps_per_second": 0.825,
79
+ "step": 3443
80
  },
81
  {
82
+ "epoch": 1.0,
83
+ "learning_rate": 5.010165553296544e-05,
84
+ "loss": 0.0025,
85
+ "step": 3450
86
  },
87
  {
88
+ "epoch": 1.1,
89
+ "learning_rate": 5.5111821086261985e-05,
90
+ "loss": 0.0022,
91
+ "step": 3795
92
  },
93
  {
94
+ "epoch": 1.2,
95
+ "learning_rate": 6.012198663955853e-05,
96
+ "loss": 0.002,
97
+ "step": 4140
 
 
 
 
98
  },
99
  {
100
+ "epoch": 1.3,
101
+ "learning_rate": 6.513215219285507e-05,
102
+ "loss": 0.0022,
103
+ "step": 4485
104
  },
105
  {
106
+ "epoch": 1.4,
107
+ "learning_rate": 7.014231774615162e-05,
108
+ "loss": 0.0022,
109
+ "step": 4830
110
  },
111
  {
112
+ "epoch": 1.5,
113
+ "learning_rate": 7.515248329944817e-05,
114
+ "loss": 0.002,
115
+ "step": 5175
116
  },
117
  {
118
+ "epoch": 1.6,
119
+ "learning_rate": 8.016264885274471e-05,
120
+ "loss": 0.0021,
121
+ "step": 5520
 
 
 
 
122
  },
123
  {
124
+ "epoch": 1.7,
125
+ "learning_rate": 8.517281440604125e-05,
126
+ "loss": 0.0021,
127
+ "step": 5865
128
  },
129
  {
130
+ "epoch": 1.8,
131
+ "learning_rate": 9.018297995933778e-05,
132
+ "loss": 0.0019,
133
+ "step": 6210
134
  },
135
  {
136
+ "epoch": 1.9,
137
+ "learning_rate": 9.519314551263433e-05,
138
+ "loss": 0.0019,
139
+ "step": 6555
140
  },
141
  {
142
+ "epoch": 2.0,
143
+ "eval_loss": 0.0008699939935468137,
144
+ "eval_max_distance": 3,
145
  "eval_mean_distance": 0,
146
+ "eval_runtime": 2.4743,
147
+ "eval_samples_per_second": 20.208,
148
+ "eval_steps_per_second": 0.808,
149
+ "step": 6886
 
 
 
 
 
 
 
 
 
 
 
 
150
  },
151
  {
152
+ "epoch": 2.0,
153
+ "learning_rate": 9.997740988156324e-05,
154
+ "loss": 0.0021,
155
+ "step": 6900
156
  },
157
  {
158
+ "epoch": 2.1,
159
+ "learning_rate": 9.942072482008585e-05,
160
+ "loss": 0.0014,
161
+ "step": 7245
162
  },
163
  {
164
+ "epoch": 2.2,
165
+ "learning_rate": 9.886403975860845e-05,
166
+ "loss": 0.002,
167
+ "step": 7590
 
 
 
 
168
  },
169
  {
170
+ "epoch": 2.3,
171
+ "learning_rate": 9.830735469713106e-05,
172
+ "loss": 0.0015,
173
+ "step": 7935
174
  },
175
  {
176
+ "epoch": 2.4,
177
+ "learning_rate": 9.775066963565367e-05,
178
+ "loss": 0.0012,
179
+ "step": 8280
180
  },
181
  {
182
+ "epoch": 2.51,
183
+ "learning_rate": 9.719398457417627e-05,
184
+ "loss": 0.0017,
185
+ "step": 8625
186
  },
187
  {
188
+ "epoch": 2.61,
189
+ "learning_rate": 9.663729951269888e-05,
190
+ "loss": 0.0013,
191
+ "step": 8970
 
 
 
 
192
  },
193
  {
194
+ "epoch": 2.71,
195
+ "learning_rate": 9.608061445122149e-05,
196
+ "loss": 0.0015,
197
+ "step": 9315
198
  },
199
  {
200
+ "epoch": 2.81,
201
+ "learning_rate": 9.552392938974409e-05,
202
+ "loss": 0.0018,
203
+ "step": 9660
204
  },
205
  {
206
+ "epoch": 2.91,
207
+ "learning_rate": 9.49672443282667e-05,
208
+ "loss": 0.0016,
209
+ "step": 10005
210
  },
211
  {
212
+ "epoch": 3.0,
213
+ "eval_loss": 0.0013113931054249406,
214
+ "eval_max_distance": 3,
215
  "eval_mean_distance": 0,
216
+ "eval_runtime": 2.4418,
217
+ "eval_samples_per_second": 20.477,
218
+ "eval_steps_per_second": 0.819,
219
+ "step": 10329
 
 
 
 
 
 
220
  },
221
  {
222
+ "epoch": 3.01,
223
+ "learning_rate": 9.44105592667893e-05,
224
+ "loss": 0.0013,
225
+ "step": 10350
 
 
 
 
 
 
226
  },
227
  {
228
+ "epoch": 3.11,
229
+ "learning_rate": 9.385387420531191e-05,
230
+ "loss": 0.0011,
231
+ "step": 10695
232
  },
233
  {
234
+ "epoch": 3.21,
235
+ "learning_rate": 9.329718914383452e-05,
236
+ "loss": 0.0011,
237
+ "step": 11040
 
 
 
 
238
  },
239
  {
240
+ "epoch": 3.31,
241
+ "learning_rate": 9.274050408235712e-05,
242
+ "loss": 0.0012,
243
+ "step": 11385
244
  },
245
  {
246
+ "epoch": 3.41,
247
+ "learning_rate": 9.218381902087973e-05,
248
+ "loss": 0.0013,
249
+ "step": 11730
250
  },
251
  {
252
+ "epoch": 3.51,
253
+ "learning_rate": 9.162713395940234e-05,
254
+ "loss": 0.0012,
255
+ "step": 12075
256
  },
257
  {
258
+ "epoch": 3.61,
259
+ "learning_rate": 9.107044889792494e-05,
260
+ "loss": 0.0012,
261
+ "step": 12420
 
 
 
 
262
  },
263
  {
264
+ "epoch": 3.71,
265
+ "learning_rate": 9.051376383644755e-05,
266
+ "loss": 0.0015,
267
+ "step": 12765
268
  },
269
  {
270
+ "epoch": 3.81,
271
+ "learning_rate": 8.995707877497016e-05,
272
+ "loss": 0.0011,
273
+ "step": 13110
274
  },
275
  {
276
+ "epoch": 3.91,
277
+ "learning_rate": 8.940039371349276e-05,
278
+ "loss": 0.0013,
279
+ "step": 13455
280
  },
281
  {
282
+ "epoch": 4.0,
283
+ "eval_loss": 0.0007705892785452306,
284
+ "eval_max_distance": 1,
285
  "eval_mean_distance": 0,
286
+ "eval_runtime": 2.3823,
287
+ "eval_samples_per_second": 20.989,
288
+ "eval_steps_per_second": 0.84,
289
+ "step": 13772
 
 
 
 
 
 
290
  },
291
  {
292
+ "epoch": 4.01,
293
+ "learning_rate": 8.884370865201537e-05,
294
+ "loss": 0.0012,
295
+ "step": 13800
 
 
 
 
 
 
296
  },
297
  {
298
+ "epoch": 4.11,
299
+ "learning_rate": 8.828702359053797e-05,
300
+ "loss": 0.0011,
301
+ "step": 14145
302
  },
303
  {
304
+ "epoch": 4.21,
305
+ "learning_rate": 8.773033852906058e-05,
306
+ "loss": 0.001,
307
+ "step": 14490
 
 
 
 
308
  },
309
  {
310
+ "epoch": 4.31,
311
+ "learning_rate": 8.717365346758319e-05,
312
+ "loss": 0.0009,
313
+ "step": 14835
314
  },
315
  {
316
+ "epoch": 4.41,
317
+ "learning_rate": 8.661696840610579e-05,
318
+ "loss": 0.001,
319
+ "step": 15180
320
  },
321
  {
322
+ "epoch": 4.51,
323
+ "learning_rate": 8.60602833446284e-05,
324
+ "loss": 0.001,
325
+ "step": 15525
326
  },
327
  {
328
+ "epoch": 4.61,
329
+ "learning_rate": 8.550359828315099e-05,
330
+ "loss": 0.001,
331
+ "step": 15870
 
 
 
 
332
  },
333
  {
334
+ "epoch": 4.71,
335
+ "learning_rate": 8.494691322167361e-05,
336
+ "loss": 0.0011,
337
+ "step": 16215
338
  },
339
  {
340
+ "epoch": 4.81,
341
+ "learning_rate": 8.439022816019622e-05,
342
+ "loss": 0.001,
343
+ "step": 16560
344
  },
345
  {
346
+ "epoch": 4.91,
347
+ "learning_rate": 8.383354309871882e-05,
348
+ "loss": 0.0012,
349
+ "step": 16905
350
  },
351
  {
352
+ "epoch": 5.0,
353
+ "eval_loss": 0.0010777737479656935,
354
+ "eval_max_distance": 3,
355
  "eval_mean_distance": 0,
356
+ "eval_runtime": 2.413,
357
+ "eval_samples_per_second": 20.721,
358
+ "eval_steps_per_second": 0.829,
359
+ "step": 17215
360
  },
361
  {
362
+ "epoch": 5.01,
363
+ "learning_rate": 8.327685803724142e-05,
364
+ "loss": 0.001,
365
+ "step": 17250
 
 
 
 
 
 
 
 
 
 
 
 
366
  },
367
  {
368
+ "epoch": 5.11,
369
+ "learning_rate": 8.272017297576404e-05,
370
+ "loss": 0.0008,
371
+ "step": 17595
372
  },
373
  {
374
+ "epoch": 5.21,
375
+ "learning_rate": 8.216348791428664e-05,
376
+ "loss": 0.0009,
377
+ "step": 17940
 
 
 
 
378
  },
379
  {
380
+ "epoch": 5.31,
381
+ "learning_rate": 8.160680285280925e-05,
382
+ "loss": 0.0009,
383
+ "step": 18285
384
  },
385
  {
386
+ "epoch": 5.41,
387
+ "learning_rate": 8.105011779133184e-05,
388
+ "loss": 0.0009,
389
+ "step": 18630
390
  },
391
  {
392
+ "epoch": 5.51,
393
+ "learning_rate": 8.049343272985446e-05,
394
+ "loss": 0.0009,
395
+ "step": 18975
396
  },
397
  {
398
+ "epoch": 5.61,
399
+ "learning_rate": 7.993674766837707e-05,
400
+ "loss": 0.0008,
401
+ "step": 19320
 
 
 
 
402
  },
403
  {
404
+ "epoch": 5.71,
405
+ "learning_rate": 7.938006260689967e-05,
406
+ "loss": 0.0009,
407
+ "step": 19665
408
  },
409
  {
410
+ "epoch": 5.81,
411
+ "learning_rate": 7.882337754542227e-05,
412
+ "loss": 0.0011,
413
+ "step": 20010
414
  },
415
  {
416
+ "epoch": 5.91,
417
+ "learning_rate": 7.826669248394489e-05,
418
+ "loss": 0.0009,
419
+ "step": 20355
420
  },
421
  {
422
+ "epoch": 6.0,
423
+ "eval_loss": 0.0008971834322437644,
424
+ "eval_max_distance": 3,
425
  "eval_mean_distance": 0,
426
+ "eval_runtime": 2.4223,
427
+ "eval_samples_per_second": 20.641,
428
+ "eval_steps_per_second": 0.826,
429
+ "step": 20658
430
  },
431
  {
432
+ "epoch": 6.01,
433
+ "learning_rate": 7.77100074224675e-05,
434
+ "loss": 0.001,
435
+ "step": 20700
 
 
 
 
 
 
 
 
 
 
 
 
436
  },
437
  {
438
+ "epoch": 6.11,
439
+ "learning_rate": 7.71533223609901e-05,
440
+ "loss": 0.0007,
441
+ "step": 21045
442
  },
443
  {
444
+ "epoch": 6.21,
445
+ "learning_rate": 7.65966372995127e-05,
446
+ "loss": 0.0008,
447
+ "step": 21390
 
 
 
 
448
  },
449
  {
450
+ "epoch": 6.31,
451
+ "learning_rate": 7.603995223803531e-05,
452
+ "loss": 0.0008,
453
+ "step": 21735
454
  },
455
  {
456
+ "epoch": 6.41,
457
+ "learning_rate": 7.548326717655792e-05,
458
+ "loss": 0.0008,
459
+ "step": 22080
460
  },
461
  {
462
+ "epoch": 6.51,
463
+ "learning_rate": 7.492658211508052e-05,
464
+ "loss": 0.0008,
465
+ "step": 22425
466
  },
467
  {
468
+ "epoch": 6.61,
469
+ "learning_rate": 7.436989705360313e-05,
470
+ "loss": 0.0008,
471
+ "step": 22770
 
 
 
 
472
  },
473
  {
474
+ "epoch": 6.71,
475
+ "learning_rate": 7.381321199212572e-05,
476
+ "loss": 0.0008,
477
+ "step": 23115
478
  },
479
  {
480
+ "epoch": 6.81,
481
+ "learning_rate": 7.325652693064834e-05,
482
+ "loss": 0.0008,
483
+ "step": 23460
484
  },
485
  {
486
+ "epoch": 6.91,
487
+ "learning_rate": 7.269984186917095e-05,
488
+ "loss": 0.0008,
489
+ "step": 23805
490
  },
491
  {
492
+ "epoch": 7.0,
493
+ "eval_loss": 0.0011104686418548226,
494
+ "eval_max_distance": 3,
495
  "eval_mean_distance": 0,
496
+ "eval_runtime": 2.4364,
497
+ "eval_samples_per_second": 20.522,
498
+ "eval_steps_per_second": 0.821,
499
+ "step": 24101
 
 
 
 
 
 
500
  },
501
  {
502
+ "epoch": 7.01,
503
+ "learning_rate": 7.214315680769356e-05,
504
+ "loss": 0.0007,
505
+ "step": 24150
 
 
 
 
 
 
506
  },
507
  {
508
+ "epoch": 7.11,
509
+ "learning_rate": 7.158647174621615e-05,
510
+ "loss": 0.0007,
511
+ "step": 24495
512
  },
513
  {
514
+ "epoch": 7.21,
515
+ "learning_rate": 7.102978668473877e-05,
516
+ "loss": 0.0007,
517
+ "step": 24840
 
 
 
 
518
  },
519
  {
520
+ "epoch": 7.31,
521
+ "learning_rate": 7.047310162326137e-05,
522
+ "loss": 0.0007,
523
+ "step": 25185
524
  },
525
  {
526
+ "epoch": 7.42,
527
+ "learning_rate": 6.991641656178398e-05,
528
+ "loss": 0.0007,
529
+ "step": 25530
530
  },
531
  {
532
+ "epoch": 7.52,
533
+ "learning_rate": 6.935973150030657e-05,
534
+ "loss": 0.0008,
535
+ "step": 25875
536
  },
537
  {
538
+ "epoch": 7.62,
539
+ "learning_rate": 6.88030464388292e-05,
540
+ "loss": 0.0006,
541
+ "step": 26220
 
 
 
 
542
  },
543
  {
544
+ "epoch": 7.72,
545
+ "learning_rate": 6.82463613773518e-05,
546
+ "loss": 0.0008,
547
+ "step": 26565
548
  },
549
  {
550
+ "epoch": 7.82,
551
+ "learning_rate": 6.76896763158744e-05,
552
+ "loss": 0.0008,
553
+ "step": 26910
554
  },
555
  {
556
+ "epoch": 7.92,
557
+ "learning_rate": 6.7132991254397e-05,
558
+ "loss": 0.0007,
559
+ "step": 27255
560
  },
561
  {
562
+ "epoch": 8.0,
563
+ "eval_loss": 0.0009568997193127871,
564
+ "eval_max_distance": 3,
565
  "eval_mean_distance": 0,
566
+ "eval_runtime": 2.353,
567
+ "eval_samples_per_second": 21.249,
568
+ "eval_steps_per_second": 0.85,
569
+ "step": 27544
 
 
 
 
 
 
570
  },
571
  {
572
+ "epoch": 8.02,
573
+ "learning_rate": 6.657630619291962e-05,
574
+ "loss": 0.0008,
575
+ "step": 27600
 
 
 
 
 
 
576
  },
577
  {
578
+ "epoch": 8.12,
579
+ "learning_rate": 6.601962113144223e-05,
580
+ "loss": 0.0006,
581
+ "step": 27945
582
  },
583
  {
584
+ "epoch": 8.22,
585
+ "learning_rate": 6.546293606996483e-05,
586
+ "loss": 0.0006,
587
+ "step": 28290
 
 
 
 
588
  },
589
  {
590
+ "epoch": 8.32,
591
+ "learning_rate": 6.490625100848742e-05,
592
+ "loss": 0.0007,
593
+ "step": 28635
594
  },
595
  {
596
+ "epoch": 8.42,
597
+ "learning_rate": 6.434956594701004e-05,
598
+ "loss": 0.0007,
599
+ "step": 28980
600
  },
601
  {
602
+ "epoch": 8.52,
603
+ "learning_rate": 6.379288088553265e-05,
604
+ "loss": 0.0008,
605
+ "step": 29325
606
  },
607
  {
608
+ "epoch": 8.62,
609
+ "learning_rate": 6.323619582405526e-05,
610
+ "loss": 0.0006,
611
+ "step": 29670
 
 
 
 
612
  },
613
  {
614
+ "epoch": 8.72,
615
+ "learning_rate": 6.267951076257785e-05,
616
+ "loss": 0.0006,
617
+ "step": 30015
618
  },
619
  {
620
+ "epoch": 8.82,
621
+ "learning_rate": 6.212282570110046e-05,
622
+ "loss": 0.0007,
623
+ "step": 30360
624
  },
625
  {
626
+ "epoch": 8.92,
627
+ "learning_rate": 6.156614063962308e-05,
628
+ "loss": 0.0006,
629
+ "step": 30705
630
  },
631
  {
632
+ "epoch": 9.0,
633
+ "eval_loss": 0.001235798466950655,
634
+ "eval_max_distance": 3,
635
  "eval_mean_distance": 0,
636
+ "eval_runtime": 2.3847,
637
+ "eval_samples_per_second": 20.967,
638
+ "eval_steps_per_second": 0.839,
639
+ "step": 30987
 
 
 
 
 
 
640
  },
641
  {
642
+ "epoch": 9.02,
643
+ "learning_rate": 6.100945557814568e-05,
644
+ "loss": 0.0007,
645
+ "step": 31050
 
 
 
 
 
 
646
  },
647
  {
648
+ "epoch": 9.12,
649
+ "learning_rate": 6.045277051666828e-05,
650
+ "loss": 0.0006,
651
+ "step": 31395
652
  },
653
  {
654
+ "epoch": 9.22,
655
+ "learning_rate": 5.989608545519089e-05,
656
+ "loss": 0.0005,
657
+ "step": 31740
 
 
 
 
658
  },
659
  {
660
+ "epoch": 9.32,
661
+ "learning_rate": 5.9339400393713494e-05,
662
+ "loss": 0.0006,
663
+ "step": 32085
664
  },
665
  {
666
+ "epoch": 9.42,
667
+ "learning_rate": 5.878271533223611e-05,
668
+ "loss": 0.0007,
669
+ "step": 32430
670
  },
671
  {
672
+ "epoch": 9.52,
673
+ "learning_rate": 5.822603027075871e-05,
674
+ "loss": 0.0006,
675
+ "step": 32775
676
  },
677
  {
678
+ "epoch": 9.62,
679
+ "learning_rate": 5.766934520928131e-05,
680
+ "loss": 0.0006,
681
+ "step": 33120
 
 
 
 
682
  },
683
  {
684
+ "epoch": 9.72,
685
+ "learning_rate": 5.711266014780392e-05,
686
+ "loss": 0.0006,
687
+ "step": 33465
688
  },
689
  {
690
+ "epoch": 9.82,
691
+ "learning_rate": 5.655597508632653e-05,
692
+ "loss": 0.0006,
693
+ "step": 33810
694
  },
695
  {
696
+ "epoch": 9.92,
697
+ "learning_rate": 5.599929002484914e-05,
698
+ "loss": 0.0006,
699
+ "step": 34155
700
  },
701
  {
702
+ "epoch": 10.0,
703
+ "eval_loss": 0.0008162627927958965,
704
+ "eval_max_distance": 3,
705
  "eval_mean_distance": 0,
706
+ "eval_runtime": 2.4197,
707
+ "eval_samples_per_second": 20.663,
708
+ "eval_steps_per_second": 0.827,
709
+ "step": 34430
 
 
 
 
 
 
710
  },
711
  {
712
+ "epoch": 10.02,
713
+ "learning_rate": 5.544260496337174e-05,
714
+ "loss": 0.0006,
715
+ "step": 34500
 
 
 
 
 
 
716
  },
717
  {
718
+ "epoch": 10.12,
719
+ "learning_rate": 5.4885919901894344e-05,
720
+ "loss": 0.0005,
721
+ "step": 34845
722
  },
723
  {
724
+ "epoch": 10.22,
725
+ "learning_rate": 5.432923484041696e-05,
726
+ "loss": 0.0006,
727
+ "step": 35190
 
 
 
 
728
  },
729
  {
730
+ "epoch": 10.32,
731
+ "learning_rate": 5.377254977893956e-05,
732
+ "loss": 0.0005,
733
+ "step": 35535
734
  },
735
  {
736
+ "epoch": 10.42,
737
+ "learning_rate": 5.321586471746216e-05,
738
+ "loss": 0.0005,
739
+ "step": 35880
740
  },
741
  {
742
+ "epoch": 10.52,
743
+ "learning_rate": 5.265917965598477e-05,
744
+ "loss": 0.0006,
745
+ "step": 36225
746
  },
747
  {
748
+ "epoch": 10.62,
749
+ "learning_rate": 5.2102494594507375e-05,
750
+ "loss": 0.0006,
751
+ "step": 36570
 
 
 
 
752
  },
753
  {
754
+ "epoch": 10.72,
755
+ "learning_rate": 5.154580953302999e-05,
756
+ "loss": 0.0006,
757
+ "step": 36915
758
  },
759
  {
760
+ "epoch": 10.82,
761
+ "learning_rate": 5.098912447155259e-05,
762
+ "loss": 0.0005,
763
+ "step": 37260
764
  },
765
  {
766
+ "epoch": 10.92,
767
+ "learning_rate": 5.0432439410075194e-05,
768
+ "loss": 0.0006,
769
+ "step": 37605
770
  },
771
  {
772
+ "epoch": 11.0,
773
+ "eval_loss": 0.000511307327542454,
774
+ "eval_max_distance": 0,
775
  "eval_mean_distance": 0,
776
+ "eval_runtime": 2.378,
777
+ "eval_samples_per_second": 21.026,
778
+ "eval_steps_per_second": 0.841,
779
+ "step": 37873
 
 
 
 
 
 
780
  },
781
  {
782
+ "epoch": 11.02,
783
+ "learning_rate": 4.98757543485978e-05,
784
+ "loss": 0.0005,
785
+ "step": 37950
 
 
 
 
 
 
786
  },
787
  {
788
+ "epoch": 11.12,
789
+ "learning_rate": 4.931906928712041e-05,
790
+ "loss": 0.0006,
791
+ "step": 38295
792
  },
793
  {
794
+ "epoch": 11.22,
795
+ "learning_rate": 4.876238422564301e-05,
796
+ "loss": 0.0005,
797
+ "step": 38640
 
 
 
 
798
  },
799
  {
800
+ "epoch": 11.32,
801
+ "learning_rate": 4.820569916416562e-05,
802
+ "loss": 0.0005,
803
+ "step": 38985
804
  },
805
  {
806
+ "epoch": 11.42,
807
+ "learning_rate": 4.7649014102688226e-05,
808
+ "loss": 0.0005,
809
+ "step": 39330
810
  },
811
  {
812
+ "epoch": 11.52,
813
+ "learning_rate": 4.709232904121083e-05,
814
+ "loss": 0.0005,
815
+ "step": 39675
816
  },
817
  {
818
+ "epoch": 11.62,
819
+ "learning_rate": 4.653564397973344e-05,
820
+ "loss": 0.0005,
821
+ "step": 40020
 
 
 
 
822
  },
823
  {
824
+ "epoch": 11.72,
825
+ "learning_rate": 4.5978958918256045e-05,
826
+ "loss": 0.0006,
827
+ "step": 40365
828
  },
829
  {
830
+ "epoch": 11.82,
831
+ "learning_rate": 4.542227385677865e-05,
832
+ "loss": 0.0007,
833
+ "step": 40710
834
  },
835
  {
836
+ "epoch": 11.92,
837
+ "learning_rate": 4.486558879530126e-05,
838
+ "loss": 0.0005,
839
+ "step": 41055
840
  },
841
  {
842
+ "epoch": 12.0,
843
+ "eval_loss": 0.0007161315297707915,
844
+ "eval_max_distance": 1,
845
  "eval_mean_distance": 0,
846
+ "eval_runtime": 2.3647,
847
+ "eval_samples_per_second": 21.145,
848
+ "eval_steps_per_second": 0.846,
849
+ "step": 41316
 
 
 
 
 
 
850
  },
851
  {
852
+ "epoch": 12.02,
853
+ "learning_rate": 4.430890373382386e-05,
854
+ "loss": 0.0005,
855
+ "step": 41400
 
 
 
 
 
 
856
  },
857
  {
858
+ "epoch": 12.12,
859
+ "learning_rate": 4.375221867234647e-05,
860
+ "loss": 0.0005,
861
+ "step": 41745
862
  },
863
  {
864
+ "epoch": 12.22,
865
+ "learning_rate": 4.3195533610869076e-05,
866
+ "loss": 0.0005,
867
+ "step": 42090
 
 
 
 
868
  },
869
  {
870
+ "epoch": 12.33,
871
+ "learning_rate": 4.263884854939168e-05,
872
+ "loss": 0.0006,
873
+ "step": 42435
874
  },
875
  {
876
+ "epoch": 12.43,
877
+ "learning_rate": 4.208216348791429e-05,
878
+ "loss": 0.0006,
879
+ "step": 42780
880
  },
881
  {
882
+ "epoch": 12.53,
883
+ "learning_rate": 4.1525478426436895e-05,
884
+ "loss": 0.0005,
885
+ "step": 43125
886
  },
887
  {
888
+ "epoch": 12.63,
889
+ "learning_rate": 4.09687933649595e-05,
890
+ "loss": 0.0005,
891
+ "step": 43470
 
 
 
 
892
  },
893
  {
894
+ "epoch": 12.73,
895
+ "learning_rate": 4.041210830348211e-05,
896
+ "loss": 0.0005,
897
+ "step": 43815
898
  },
899
  {
900
+ "epoch": 12.83,
901
+ "learning_rate": 3.9855423242004714e-05,
902
+ "loss": 0.0007,
903
+ "step": 44160
904
  },
905
  {
906
+ "epoch": 12.93,
907
+ "learning_rate": 3.929873818052732e-05,
908
+ "loss": 0.0004,
909
+ "step": 44505
910
  },
911
  {
912
+ "epoch": 13.0,
913
+ "eval_loss": 0.0006984297069720924,
914
+ "eval_max_distance": 0,
915
  "eval_mean_distance": 0,
916
+ "eval_runtime": 2.3638,
917
+ "eval_samples_per_second": 21.153,
918
+ "eval_steps_per_second": 0.846,
919
+ "step": 44759
 
 
 
 
 
 
 
 
 
 
 
 
920
  },
921
  {
922
+ "epoch": 13.03,
923
+ "learning_rate": 3.8742053119049926e-05,
924
+ "loss": 0.0005,
925
+ "step": 44850
926
  },
927
  {
928
+ "epoch": 13.13,
929
+ "learning_rate": 3.818536805757253e-05,
930
+ "loss": 0.0006,
931
+ "step": 45195
932
  },
933
  {
934
+ "epoch": 13.23,
935
+ "learning_rate": 3.762868299609514e-05,
936
+ "loss": 0.0005,
937
+ "step": 45540
 
 
 
 
938
  },
939
  {
940
+ "epoch": 13.33,
941
+ "learning_rate": 3.7071997934617745e-05,
942
+ "loss": 0.0005,
943
+ "step": 45885
944
  },
945
  {
946
+ "epoch": 13.43,
947
+ "learning_rate": 3.651531287314035e-05,
948
+ "loss": 0.0005,
949
+ "step": 46230
950
  },
951
  {
952
+ "epoch": 13.53,
953
+ "learning_rate": 3.595862781166296e-05,
954
+ "loss": 0.0006,
955
+ "step": 46575
956
  },
957
  {
958
+ "epoch": 13.63,
959
+ "learning_rate": 3.5401942750185564e-05,
960
+ "loss": 0.0005,
961
+ "step": 46920
 
 
 
 
962
  },
963
  {
964
+ "epoch": 13.73,
965
+ "learning_rate": 3.484525768870817e-05,
966
+ "loss": 0.0005,
967
+ "step": 47265
968
  },
969
  {
970
+ "epoch": 13.83,
971
+ "learning_rate": 3.4288572627230776e-05,
972
+ "loss": 0.0004,
973
+ "step": 47610
974
  },
975
  {
976
+ "epoch": 13.93,
977
+ "learning_rate": 3.373188756575338e-05,
978
+ "loss": 0.0006,
979
+ "step": 47955
980
  },
981
  {
982
+ "epoch": 14.0,
983
+ "eval_loss": 0.001082880888134241,
984
+ "eval_max_distance": 3,
985
  "eval_mean_distance": 0,
986
+ "eval_runtime": 2.3639,
987
+ "eval_samples_per_second": 21.152,
988
+ "eval_steps_per_second": 0.846,
989
+ "step": 48202
 
 
 
 
 
 
 
 
 
 
 
 
990
  },
991
  {
992
+ "epoch": 14.03,
993
+ "learning_rate": 3.317520250427599e-05,
994
+ "loss": 0.0005,
995
+ "step": 48300
996
  },
997
  {
998
+ "epoch": 14.13,
999
+ "learning_rate": 3.2618517442798595e-05,
1000
+ "loss": 0.0005,
1001
+ "step": 48645
1002
  },
1003
  {
1004
+ "epoch": 14.23,
1005
+ "learning_rate": 3.20618323813212e-05,
1006
+ "loss": 0.0005,
1007
+ "step": 48990
 
 
 
 
1008
  },
1009
  {
1010
+ "epoch": 14.33,
1011
+ "learning_rate": 3.150514731984381e-05,
1012
+ "loss": 0.0004,
1013
+ "step": 49335
1014
  },
1015
  {
1016
+ "epoch": 14.43,
1017
+ "learning_rate": 3.0948462258366414e-05,
1018
+ "loss": 0.0005,
1019
+ "step": 49680
1020
  },
1021
  {
1022
+ "epoch": 14.53,
1023
+ "learning_rate": 3.039177719688902e-05,
1024
+ "loss": 0.0004,
1025
+ "step": 50025
1026
  },
1027
  {
1028
+ "epoch": 14.63,
1029
+ "learning_rate": 2.9835092135411623e-05,
1030
+ "loss": 0.0005,
1031
+ "step": 50370
 
 
 
 
1032
  },
1033
  {
1034
+ "epoch": 14.73,
1035
+ "learning_rate": 2.9278407073934233e-05,
1036
+ "loss": 0.0005,
1037
+ "step": 50715
1038
  },
1039
  {
1040
+ "epoch": 14.83,
1041
+ "learning_rate": 2.8721722012456836e-05,
1042
+ "loss": 0.0005,
1043
+ "step": 51060
1044
  },
1045
  {
1046
+ "epoch": 14.93,
1047
+ "learning_rate": 2.8165036950979445e-05,
1048
+ "loss": 0.0005,
1049
+ "step": 51405
1050
  },
1051
  {
1052
+ "epoch": 15.0,
1053
+ "eval_loss": 0.0008490388281643391,
1054
+ "eval_max_distance": 3,
1055
  "eval_mean_distance": 0,
1056
+ "eval_runtime": 2.3846,
1057
+ "eval_samples_per_second": 20.967,
1058
+ "eval_steps_per_second": 0.839,
1059
+ "step": 51645
 
 
 
 
 
 
 
 
 
 
 
 
1060
  },
1061
  {
1062
+ "epoch": 15.03,
1063
+ "learning_rate": 2.760835188950205e-05,
1064
+ "loss": 0.0005,
1065
+ "step": 51750
1066
  },
1067
  {
1068
+ "epoch": 15.13,
1069
+ "learning_rate": 2.7051666828024658e-05,
1070
+ "loss": 0.0005,
1071
+ "step": 52095
1072
  },
1073
  {
1074
+ "epoch": 15.23,
1075
+ "learning_rate": 2.649498176654726e-05,
1076
+ "loss": 0.0004,
1077
+ "step": 52440
 
 
 
 
1078
  },
1079
  {
1080
+ "epoch": 15.33,
1081
+ "learning_rate": 2.593829670506987e-05,
1082
+ "loss": 0.0005,
1083
+ "step": 52785
1084
  },
1085
  {
1086
+ "epoch": 15.43,
1087
+ "learning_rate": 2.5381611643592473e-05,
1088
+ "loss": 0.0004,
1089
+ "step": 53130
1090
  },
1091
  {
1092
+ "epoch": 15.53,
1093
+ "learning_rate": 2.4824926582115083e-05,
1094
+ "loss": 0.0004,
1095
+ "step": 53475
1096
  },
1097
  {
1098
+ "epoch": 15.63,
1099
+ "learning_rate": 2.426824152063769e-05,
1100
+ "loss": 0.0005,
1101
+ "step": 53820
 
 
 
 
1102
  },
1103
  {
1104
+ "epoch": 15.73,
1105
+ "learning_rate": 2.3711556459160296e-05,
1106
+ "loss": 0.0005,
1107
+ "step": 54165
1108
  },
1109
  {
1110
+ "epoch": 15.83,
1111
+ "learning_rate": 2.3154871397682902e-05,
1112
+ "loss": 0.0004,
1113
+ "step": 54510
1114
  },
1115
  {
1116
+ "epoch": 15.93,
1117
+ "learning_rate": 2.2598186336205508e-05,
1118
+ "loss": 0.0005,
1119
+ "step": 54855
1120
  },
1121
  {
1122
+ "epoch": 16.0,
1123
+ "eval_loss": 0.0008035104838199914,
1124
+ "eval_max_distance": 3,
1125
  "eval_mean_distance": 0,
1126
+ "eval_runtime": 2.3835,
1127
+ "eval_samples_per_second": 20.978,
1128
+ "eval_steps_per_second": 0.839,
1129
+ "step": 55088
1130
  },
1131
  {
1132
+ "epoch": 16.03,
1133
+ "learning_rate": 2.2041501274728115e-05,
1134
+ "loss": 0.0005,
1135
+ "step": 55200
 
 
 
 
 
 
 
 
 
 
 
 
1136
  },
1137
  {
1138
+ "epoch": 16.13,
1139
+ "learning_rate": 2.1484816213250717e-05,
1140
+ "loss": 0.0004,
1141
+ "step": 55545
1142
  },
1143
  {
1144
+ "epoch": 16.23,
1145
+ "learning_rate": 2.0928131151773324e-05,
1146
+ "loss": 0.0004,
1147
+ "step": 55890
 
 
 
 
1148
  },
1149
  {
1150
+ "epoch": 16.33,
1151
+ "learning_rate": 2.037144609029593e-05,
1152
+ "loss": 0.0005,
1153
+ "step": 56235
1154
  },
1155
  {
1156
+ "epoch": 16.43,
1157
+ "learning_rate": 1.9814761028818536e-05,
1158
+ "loss": 0.0005,
1159
+ "step": 56580
1160
  },
1161
  {
1162
+ "epoch": 16.53,
1163
+ "learning_rate": 1.9258075967341143e-05,
1164
+ "loss": 0.0004,
1165
+ "step": 56925
1166
  },
1167
  {
1168
+ "epoch": 16.63,
1169
+ "learning_rate": 1.870139090586375e-05,
1170
+ "loss": 0.0005,
1171
+ "step": 57270
 
 
 
 
1172
  },
1173
  {
1174
+ "epoch": 16.73,
1175
+ "learning_rate": 1.8144705844386355e-05,
1176
+ "loss": 0.0004,
1177
+ "step": 57615
1178
  },
1179
  {
1180
+ "epoch": 16.83,
1181
+ "learning_rate": 1.758802078290896e-05,
1182
+ "loss": 0.0004,
1183
+ "step": 57960
1184
  },
1185
  {
1186
+ "epoch": 16.93,
1187
+ "learning_rate": 1.7031335721431568e-05,
1188
+ "loss": 0.0005,
1189
+ "step": 58305
1190
  },
1191
  {
1192
+ "epoch": 17.0,
1193
+ "eval_loss": 0.000769978913012892,
1194
+ "eval_max_distance": 3,
1195
  "eval_mean_distance": 0,
1196
+ "eval_runtime": 2.3886,
1197
+ "eval_samples_per_second": 20.933,
1198
+ "eval_steps_per_second": 0.837,
1199
+ "step": 58531
 
 
 
 
 
 
1200
  },
1201
  {
1202
+ "epoch": 17.03,
1203
+ "learning_rate": 1.6474650659954174e-05,
1204
+ "loss": 0.0005,
1205
+ "step": 58650
 
 
 
 
 
 
1206
  },
1207
  {
1208
+ "epoch": 17.13,
1209
+ "learning_rate": 1.591796559847678e-05,
1210
+ "loss": 0.0004,
1211
+ "step": 58995
1212
  },
1213
  {
1214
+ "epoch": 17.23,
1215
+ "learning_rate": 1.5361280536999387e-05,
1216
+ "loss": 0.0005,
1217
+ "step": 59340
 
 
 
 
1218
  },
1219
  {
1220
+ "epoch": 17.34,
1221
+ "learning_rate": 1.4804595475521993e-05,
1222
+ "loss": 0.0005,
1223
+ "step": 59685
1224
  },
1225
  {
1226
+ "epoch": 17.44,
1227
+ "learning_rate": 1.4247910414044599e-05,
1228
+ "loss": 0.0004,
1229
+ "step": 60030
1230
  },
1231
  {
1232
+ "epoch": 17.54,
1233
+ "learning_rate": 1.3691225352567205e-05,
1234
+ "loss": 0.0004,
1235
+ "step": 60375
1236
  },
1237
  {
1238
+ "epoch": 17.64,
1239
+ "learning_rate": 1.3134540291089812e-05,
1240
+ "loss": 0.0004,
1241
+ "step": 60720
 
 
 
 
1242
  },
1243
  {
1244
+ "epoch": 17.74,
1245
+ "learning_rate": 1.2577855229612418e-05,
1246
+ "loss": 0.0004,
1247
+ "step": 61065
1248
  },
1249
  {
1250
+ "epoch": 17.84,
1251
+ "learning_rate": 1.2021170168135024e-05,
1252
+ "loss": 0.0004,
1253
+ "step": 61410
1254
  },
1255
  {
1256
+ "epoch": 17.94,
1257
+ "learning_rate": 1.146448510665763e-05,
1258
+ "loss": 0.0004,
1259
+ "step": 61755
1260
  },
1261
  {
1262
+ "epoch": 18.0,
1263
+ "eval_loss": 0.0007153275073505938,
1264
+ "eval_max_distance": 3,
1265
  "eval_mean_distance": 0,
1266
+ "eval_runtime": 2.3591,
1267
+ "eval_samples_per_second": 21.194,
1268
+ "eval_steps_per_second": 0.848,
1269
+ "step": 61974
1270
  },
1271
  {
1272
+ "epoch": 18.04,
1273
+ "learning_rate": 1.0907800045180237e-05,
1274
+ "loss": 0.0003,
1275
+ "step": 62100
1276
  },
1277
  {
1278
+ "epoch": 18.14,
1279
+ "learning_rate": 1.0351114983702843e-05,
1280
+ "loss": 0.0004,
1281
+ "step": 62445
1282
  },
1283
  {
1284
+ "epoch": 18.24,
1285
+ "learning_rate": 9.79442992222545e-06,
1286
+ "loss": 0.0004,
1287
+ "step": 62790
1288
  },
1289
  {
1290
+ "epoch": 18.34,
1291
+ "learning_rate": 9.237744860748056e-06,
1292
+ "loss": 0.0004,
1293
+ "step": 63135
1294
  },
1295
  {
1296
+ "epoch": 18.44,
1297
+ "learning_rate": 8.681059799270662e-06,
1298
+ "loss": 0.0004,
1299
+ "step": 63480
 
 
 
 
1300
  },
1301
  {
1302
+ "epoch": 18.54,
1303
+ "learning_rate": 8.124374737793268e-06,
1304
+ "loss": 0.0004,
1305
+ "step": 63825
1306
  },
1307
  {
1308
+ "epoch": 18.64,
1309
+ "learning_rate": 7.5676896763158745e-06,
1310
  "loss": 0.0004,
1311
+ "step": 64170
1312
  },
1313
  {
1314
+ "epoch": 18.74,
1315
+ "learning_rate": 7.01100461483848e-06,
1316
  "loss": 0.0005,
1317
+ "step": 64515
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1318
  },
1319
  {
1320
+ "epoch": 18.84,
1321
+ "learning_rate": 6.454319553361088e-06,
1322
  "loss": 0.0004,
1323
+ "step": 64860
1324
  },
1325
  {
1326
+ "epoch": 18.94,
1327
+ "learning_rate": 5.897634491883693e-06,
1328
  "loss": 0.0004,
1329
+ "step": 65205
1330
  },
1331
  {
1332
+ "epoch": 19.0,
1333
+ "eval_loss": 0.0007207673625089228,
1334
+ "eval_max_distance": 3,
1335
  "eval_mean_distance": 0,
1336
+ "eval_runtime": 2.3645,
1337
+ "eval_samples_per_second": 21.147,
1338
+ "eval_steps_per_second": 0.846,
1339
+ "step": 65417
1340
  },
1341
  {
1342
+ "epoch": 19.04,
1343
+ "learning_rate": 5.3409494304063e-06,
1344
  "loss": 0.0004,
1345
+ "step": 65550
1346
  },
1347
  {
1348
+ "epoch": 19.14,
1349
+ "learning_rate": 4.784264368928906e-06,
1350
  "loss": 0.0004,
1351
+ "step": 65895
1352
  },
1353
  {
1354
+ "epoch": 19.24,
1355
+ "learning_rate": 4.227579307451512e-06,
1356
  "loss": 0.0004,
1357
+ "step": 66240
1358
  },
1359
  {
1360
+ "epoch": 19.34,
1361
+ "learning_rate": 3.6708942459741184e-06,
1362
  "loss": 0.0003,
1363
+ "step": 66585
1364
  },
1365
  {
1366
+ "epoch": 19.44,
1367
+ "learning_rate": 3.1142091844967247e-06,
1368
+ "loss": 0.0004,
1369
+ "step": 66930
 
 
 
 
 
 
 
 
 
 
1370
  },
1371
  {
1372
+ "epoch": 19.54,
1373
+ "learning_rate": 2.557524123019331e-06,
1374
  "loss": 0.0004,
1375
+ "step": 67275
1376
  },
1377
  {
1378
+ "epoch": 19.64,
1379
+ "learning_rate": 2.000839061541937e-06,
1380
+ "loss": 0.0005,
1381
+ "step": 67620
1382
  },
1383
  {
1384
+ "epoch": 19.74,
1385
+ "learning_rate": 1.4441540000645434e-06,
1386
+ "loss": 0.0005,
1387
+ "step": 67965
 
 
 
 
1388
  },
1389
  {
1390
+ "epoch": 19.84,
1391
+ "learning_rate": 8.874689385871494e-07,
1392
+ "loss": 0.0004,
1393
+ "step": 68310
1394
  },
1395
  {
1396
+ "epoch": 19.94,
1397
+ "learning_rate": 3.307838771097557e-07,
1398
+ "loss": 0.0005,
1399
+ "step": 68655
1400
  },
1401
  {
1402
+ "epoch": 20.0,
1403
+ "eval_loss": 0.0007007673266343772,
1404
+ "eval_max_distance": 3,
1405
+ "eval_mean_distance": 0,
1406
+ "eval_runtime": 2.376,
1407
+ "eval_samples_per_second": 21.044,
1408
+ "eval_steps_per_second": 0.842,
1409
+ "step": 68860
1410
  },
1411
  {
1412
+ "epoch": 20.0,
1413
+ "step": 68860,
1414
+ "total_flos": 4.025961075861504e+16,
1415
+ "train_loss": 0.0010230464439000071,
1416
+ "train_runtime": 6513.3096,
1417
+ "train_samples_per_second": 317.166,
1418
+ "train_steps_per_second": 10.572
 
 
 
 
 
 
 
 
 
 
1419
  }
1420
  ],
1421
+ "logging_steps": 345,
1422
+ "max_steps": 68860,
1423
+ "num_train_epochs": 20,
1424
+ "save_steps": 689,
1425
+ "total_flos": 4.025961075861504e+16,
1426
  "trial_name": null,
1427
  "trial_params": null
1428
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:100e47428886cfceeb4983e829afe7caff9578529dd77c77ba43967c2229d9ca
3
  size 4091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:586ed9c845a3561a92f038b449b118f6c3411340e86cc6c2b3ab167a6c2d8141
3
  size 4091