Sven00 commited on
Commit
196d17d
·
1 Parent(s): f1be7a6

Delete redpj7B-lora-cnn-dailymail_6000_samples

Browse files
Files changed (28) hide show
  1. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/adapter_config.json +0 -16
  2. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/adapter_model.bin +0 -3
  3. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/optimizer.pt +0 -3
  4. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/rng_state.pth +0 -3
  5. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/scheduler.pt +0 -3
  6. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/trainer_state.json +0 -220
  7. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/training_args.bin +0 -3
  8. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/adapter_config.json +0 -16
  9. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/adapter_model.bin +0 -3
  10. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/optimizer.pt +0 -3
  11. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/rng_state.pth +0 -3
  12. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/scheduler.pt +0 -3
  13. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/trainer_state.json +0 -3008
  14. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/training_args.bin +0 -3
  15. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/adapter_config.json +0 -16
  16. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/adapter_model.bin +0 -3
  17. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/optimizer.pt +0 -3
  18. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/rng_state.pth +0 -3
  19. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/scheduler.pt +0 -3
  20. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/trainer_state.json +0 -3076
  21. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/training_args.bin +0 -3
  22. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail_6000_samples/adapter_config.json +0 -16
  23. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail_6000_samples/adapter_model.bin +0 -3
  24. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail_6000_samples/special_tokens_map.json +0 -6
  25. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail_6000_samples/tokenizer.json +0 -0
  26. redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail_6000_samples/tokenizer_config.json +0 -10
  27. redpj7B-lora-cnn-dailymail_6000_samples/results/stdout.txt +0 -0
  28. redpj7B-lora-cnn-dailymail_6000_samples/script_fine_tuning.py +0 -170
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/adapter_config.json DELETED
@@ -1,16 +0,0 @@
1
- {
2
- "base_model_name_or_path": "/domino/edv/afs-mrmc-data-store-rw/innovation/hf/RedPajama-INCITE-7B-Base",
3
- "bias": "none",
4
- "fan_in_fan_out": false,
5
- "inference_mode": true,
6
- "init_lora_weights": true,
7
- "lora_alpha": 16,
8
- "lora_dropout": 0.05,
9
- "modules_to_save": null,
10
- "peft_type": "LORA",
11
- "r": 8,
12
- "target_modules": [
13
- "query_key_value"
14
- ],
15
- "task_type": "CAUSAL_LM"
16
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7004b69a5e5338ee5dbc682271079d0cf6750a64f3dd06f0dabf0a4c8129f41b
3
- size 16800753
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e30ce31dcb0eb6fc197fa6178337af09855e350ceabc29748e3bfad2865b0828
3
- size 33592261
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf3c8c6a24cf587ccbaf1b3ebf006c31756fbbb284486b45c4447d10a3da99c3
3
- size 14575
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff6d8dcf74529ca71baf62714afafbea88dc48c81b1277347aebdfdd87c4533d
3
- size 627
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/trainer_state.json DELETED
@@ -1,220 +0,0 @@
1
- {
2
- "best_metric": 1.9635850191116333,
3
- "best_model_checkpoint": "./results/redpj7B-lora-cnn-dailymail-results_fine_tune_test/checkpoint-600",
4
- "epoch": 0.19900497512437812,
5
- "global_step": 600,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.01,
12
- "learning_rate": 0.00027082228116710874,
13
- "loss": 1.8654,
14
- "step": 20
15
- },
16
- {
17
- "epoch": 0.01,
18
- "learning_rate": 0.0002681697612732095,
19
- "loss": 1.8124,
20
- "step": 40
21
- },
22
- {
23
- "epoch": 0.02,
24
- "learning_rate": 0.0002655172413793103,
25
- "loss": 1.8231,
26
- "step": 60
27
- },
28
- {
29
- "epoch": 0.03,
30
- "learning_rate": 0.0002628647214854111,
31
- "loss": 1.7818,
32
- "step": 80
33
- },
34
- {
35
- "epoch": 0.03,
36
- "learning_rate": 0.00026021220159151194,
37
- "loss": 1.8446,
38
- "step": 100
39
- },
40
- {
41
- "epoch": 0.04,
42
- "learning_rate": 0.0002575596816976127,
43
- "loss": 1.9001,
44
- "step": 120
45
- },
46
- {
47
- "epoch": 0.05,
48
- "learning_rate": 0.0002549071618037135,
49
- "loss": 1.8073,
50
- "step": 140
51
- },
52
- {
53
- "epoch": 0.05,
54
- "learning_rate": 0.0002522546419098143,
55
- "loss": 1.8506,
56
- "step": 160
57
- },
58
- {
59
- "epoch": 0.06,
60
- "learning_rate": 0.0002496021220159151,
61
- "loss": 1.8471,
62
- "step": 180
63
- },
64
- {
65
- "epoch": 0.07,
66
- "learning_rate": 0.0002469496021220159,
67
- "loss": 1.8647,
68
- "step": 200
69
- },
70
- {
71
- "epoch": 0.07,
72
- "eval_loss": 1.966022253036499,
73
- "eval_runtime": 12002.3952,
74
- "eval_samples_per_second": 1.114,
75
- "eval_steps_per_second": 0.139,
76
- "step": 200
77
- },
78
- {
79
- "epoch": 0.07,
80
- "learning_rate": 0.00024429708222811666,
81
- "loss": 1.8578,
82
- "step": 220
83
- },
84
- {
85
- "epoch": 0.08,
86
- "learning_rate": 0.0002416445623342175,
87
- "loss": 1.8329,
88
- "step": 240
89
- },
90
- {
91
- "epoch": 0.09,
92
- "learning_rate": 0.0002389920424403183,
93
- "loss": 1.8119,
94
- "step": 260
95
- },
96
- {
97
- "epoch": 0.09,
98
- "learning_rate": 0.0002363395225464191,
99
- "loss": 1.8884,
100
- "step": 280
101
- },
102
- {
103
- "epoch": 0.1,
104
- "learning_rate": 0.00023368700265251986,
105
- "loss": 1.9077,
106
- "step": 300
107
- },
108
- {
109
- "epoch": 0.11,
110
- "learning_rate": 0.00023103448275862065,
111
- "loss": 1.8092,
112
- "step": 320
113
- },
114
- {
115
- "epoch": 0.11,
116
- "learning_rate": 0.00022838196286472146,
117
- "loss": 1.9237,
118
- "step": 340
119
- },
120
- {
121
- "epoch": 0.12,
122
- "learning_rate": 0.00022572944297082225,
123
- "loss": 1.8701,
124
- "step": 360
125
- },
126
- {
127
- "epoch": 0.13,
128
- "learning_rate": 0.00022307692307692306,
129
- "loss": 1.8933,
130
- "step": 380
131
- },
132
- {
133
- "epoch": 0.13,
134
- "learning_rate": 0.00022042440318302385,
135
- "loss": 1.8453,
136
- "step": 400
137
- },
138
- {
139
- "epoch": 0.13,
140
- "eval_loss": 1.9667036533355713,
141
- "eval_runtime": 11991.8036,
142
- "eval_samples_per_second": 1.115,
143
- "eval_steps_per_second": 0.139,
144
- "step": 400
145
- },
146
- {
147
- "epoch": 0.14,
148
- "learning_rate": 0.00021777188328912466,
149
- "loss": 1.8272,
150
- "step": 420
151
- },
152
- {
153
- "epoch": 0.15,
154
- "learning_rate": 0.00021511936339522545,
155
- "loss": 1.8149,
156
- "step": 440
157
- },
158
- {
159
- "epoch": 0.15,
160
- "learning_rate": 0.00021246684350132626,
161
- "loss": 1.8565,
162
- "step": 460
163
- },
164
- {
165
- "epoch": 0.16,
166
- "learning_rate": 0.00020981432360742705,
167
- "loss": 1.8139,
168
- "step": 480
169
- },
170
- {
171
- "epoch": 0.17,
172
- "learning_rate": 0.0002071618037135278,
173
- "loss": 1.802,
174
- "step": 500
175
- },
176
- {
177
- "epoch": 0.17,
178
- "learning_rate": 0.00020450928381962862,
179
- "loss": 1.8795,
180
- "step": 520
181
- },
182
- {
183
- "epoch": 0.18,
184
- "learning_rate": 0.0002018567639257294,
185
- "loss": 1.8621,
186
- "step": 540
187
- },
188
- {
189
- "epoch": 0.19,
190
- "learning_rate": 0.00019920424403183022,
191
- "loss": 1.8363,
192
- "step": 560
193
- },
194
- {
195
- "epoch": 0.19,
196
- "learning_rate": 0.000196551724137931,
197
- "loss": 1.8746,
198
- "step": 580
199
- },
200
- {
201
- "epoch": 0.2,
202
- "learning_rate": 0.00019389920424403182,
203
- "loss": 1.7763,
204
- "step": 600
205
- },
206
- {
207
- "epoch": 0.2,
208
- "eval_loss": 1.9635850191116333,
209
- "eval_runtime": 11976.5234,
210
- "eval_samples_per_second": 1.116,
211
- "eval_steps_per_second": 0.14,
212
- "step": 600
213
- }
214
- ],
215
- "max_steps": 9045,
216
- "num_train_epochs": 3,
217
- "total_flos": 3.74219846516736e+16,
218
- "trial_name": null,
219
- "trial_params": null
220
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-600/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5a78612ea8930d68eba4cb53d62254ccf547582e754aa049d169c3c11dd5fe4
3
- size 4027
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/adapter_config.json DELETED
@@ -1,16 +0,0 @@
1
- {
2
- "base_model_name_or_path": "/domino/edv/afs-mrmc-data-store-rw/innovation/hf/RedPajama-INCITE-7B-Base",
3
- "bias": "none",
4
- "fan_in_fan_out": false,
5
- "inference_mode": true,
6
- "init_lora_weights": true,
7
- "lora_alpha": 16,
8
- "lora_dropout": 0.05,
9
- "modules_to_save": null,
10
- "peft_type": "LORA",
11
- "r": 8,
12
- "target_modules": [
13
- "query_key_value"
14
- ],
15
- "task_type": "CAUSAL_LM"
16
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:36c95157caaa0e8a49516175a38f9e3dfad6634df6c2d7fc47b2e3298cf4d68e
3
- size 16800753
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f45578cb0861cd43c16230fed120f8164d6858bd1387fe11181131bf8618591c
3
- size 33592261
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4113cbc0a2963d4c364a9d22be5bb0998af83f60ca6808727b9e33650e173f6
3
- size 14575
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c67c61a9d3974b17ff5914527d23045fa60ef4e1f1e7017cc41f7ea5686f8e9a
3
- size 627
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/trainer_state.json DELETED
@@ -1,3008 +0,0 @@
1
- {
2
- "best_metric": 1.9635850191116333,
3
- "best_model_checkpoint": "./results/redpj7B-lora-cnn-dailymail-results_fine_tune_test/checkpoint-600",
4
- "epoch": 2.9187396351575456,
5
- "global_step": 8800,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.01,
12
- "learning_rate": 0.00027082228116710874,
13
- "loss": 1.8654,
14
- "step": 20
15
- },
16
- {
17
- "epoch": 0.01,
18
- "learning_rate": 0.0002681697612732095,
19
- "loss": 1.8124,
20
- "step": 40
21
- },
22
- {
23
- "epoch": 0.02,
24
- "learning_rate": 0.0002655172413793103,
25
- "loss": 1.8231,
26
- "step": 60
27
- },
28
- {
29
- "epoch": 0.03,
30
- "learning_rate": 0.0002628647214854111,
31
- "loss": 1.7818,
32
- "step": 80
33
- },
34
- {
35
- "epoch": 0.03,
36
- "learning_rate": 0.00026021220159151194,
37
- "loss": 1.8446,
38
- "step": 100
39
- },
40
- {
41
- "epoch": 0.04,
42
- "learning_rate": 0.0002575596816976127,
43
- "loss": 1.9001,
44
- "step": 120
45
- },
46
- {
47
- "epoch": 0.05,
48
- "learning_rate": 0.0002549071618037135,
49
- "loss": 1.8073,
50
- "step": 140
51
- },
52
- {
53
- "epoch": 0.05,
54
- "learning_rate": 0.0002522546419098143,
55
- "loss": 1.8506,
56
- "step": 160
57
- },
58
- {
59
- "epoch": 0.06,
60
- "learning_rate": 0.0002496021220159151,
61
- "loss": 1.8471,
62
- "step": 180
63
- },
64
- {
65
- "epoch": 0.07,
66
- "learning_rate": 0.0002469496021220159,
67
- "loss": 1.8647,
68
- "step": 200
69
- },
70
- {
71
- "epoch": 0.07,
72
- "eval_loss": 1.966022253036499,
73
- "eval_runtime": 12002.3952,
74
- "eval_samples_per_second": 1.114,
75
- "eval_steps_per_second": 0.139,
76
- "step": 200
77
- },
78
- {
79
- "epoch": 0.07,
80
- "learning_rate": 0.00024429708222811666,
81
- "loss": 1.8578,
82
- "step": 220
83
- },
84
- {
85
- "epoch": 0.08,
86
- "learning_rate": 0.0002416445623342175,
87
- "loss": 1.8329,
88
- "step": 240
89
- },
90
- {
91
- "epoch": 0.09,
92
- "learning_rate": 0.0002389920424403183,
93
- "loss": 1.8119,
94
- "step": 260
95
- },
96
- {
97
- "epoch": 0.09,
98
- "learning_rate": 0.0002363395225464191,
99
- "loss": 1.8884,
100
- "step": 280
101
- },
102
- {
103
- "epoch": 0.1,
104
- "learning_rate": 0.00023368700265251986,
105
- "loss": 1.9077,
106
- "step": 300
107
- },
108
- {
109
- "epoch": 0.11,
110
- "learning_rate": 0.00023103448275862065,
111
- "loss": 1.8092,
112
- "step": 320
113
- },
114
- {
115
- "epoch": 0.11,
116
- "learning_rate": 0.00022838196286472146,
117
- "loss": 1.9237,
118
- "step": 340
119
- },
120
- {
121
- "epoch": 0.12,
122
- "learning_rate": 0.00022572944297082225,
123
- "loss": 1.8701,
124
- "step": 360
125
- },
126
- {
127
- "epoch": 0.13,
128
- "learning_rate": 0.00022307692307692306,
129
- "loss": 1.8933,
130
- "step": 380
131
- },
132
- {
133
- "epoch": 0.13,
134
- "learning_rate": 0.00022042440318302385,
135
- "loss": 1.8453,
136
- "step": 400
137
- },
138
- {
139
- "epoch": 0.13,
140
- "eval_loss": 1.9667036533355713,
141
- "eval_runtime": 11991.8036,
142
- "eval_samples_per_second": 1.115,
143
- "eval_steps_per_second": 0.139,
144
- "step": 400
145
- },
146
- {
147
- "epoch": 0.14,
148
- "learning_rate": 0.00021777188328912466,
149
- "loss": 1.8272,
150
- "step": 420
151
- },
152
- {
153
- "epoch": 0.15,
154
- "learning_rate": 0.00021511936339522545,
155
- "loss": 1.8149,
156
- "step": 440
157
- },
158
- {
159
- "epoch": 0.15,
160
- "learning_rate": 0.00021246684350132626,
161
- "loss": 1.8565,
162
- "step": 460
163
- },
164
- {
165
- "epoch": 0.16,
166
- "learning_rate": 0.00020981432360742705,
167
- "loss": 1.8139,
168
- "step": 480
169
- },
170
- {
171
- "epoch": 0.17,
172
- "learning_rate": 0.0002071618037135278,
173
- "loss": 1.802,
174
- "step": 500
175
- },
176
- {
177
- "epoch": 0.17,
178
- "learning_rate": 0.00020450928381962862,
179
- "loss": 1.8795,
180
- "step": 520
181
- },
182
- {
183
- "epoch": 0.18,
184
- "learning_rate": 0.0002018567639257294,
185
- "loss": 1.8621,
186
- "step": 540
187
- },
188
- {
189
- "epoch": 0.19,
190
- "learning_rate": 0.00019920424403183022,
191
- "loss": 1.8363,
192
- "step": 560
193
- },
194
- {
195
- "epoch": 0.19,
196
- "learning_rate": 0.000196551724137931,
197
- "loss": 1.8746,
198
- "step": 580
199
- },
200
- {
201
- "epoch": 0.2,
202
- "learning_rate": 0.00019389920424403182,
203
- "loss": 1.7763,
204
- "step": 600
205
- },
206
- {
207
- "epoch": 0.2,
208
- "eval_loss": 1.9635850191116333,
209
- "eval_runtime": 11976.5234,
210
- "eval_samples_per_second": 1.116,
211
- "eval_steps_per_second": 0.14,
212
- "step": 600
213
- },
214
- {
215
- "epoch": 0.21,
216
- "learning_rate": 0.0001912466843501326,
217
- "loss": 1.8423,
218
- "step": 620
219
- },
220
- {
221
- "epoch": 0.21,
222
- "learning_rate": 0.00018859416445623343,
223
- "loss": 1.8998,
224
- "step": 640
225
- },
226
- {
227
- "epoch": 0.22,
228
- "learning_rate": 0.0001859416445623342,
229
- "loss": 1.8131,
230
- "step": 660
231
- },
232
- {
233
- "epoch": 0.23,
234
- "learning_rate": 0.00018328912466843497,
235
- "loss": 1.8656,
236
- "step": 680
237
- },
238
- {
239
- "epoch": 0.23,
240
- "learning_rate": 0.00018063660477453579,
241
- "loss": 1.8181,
242
- "step": 700
243
- },
244
- {
245
- "epoch": 0.24,
246
- "learning_rate": 0.00017798408488063657,
247
- "loss": 1.8425,
248
- "step": 720
249
- },
250
- {
251
- "epoch": 0.25,
252
- "learning_rate": 0.00017533156498673739,
253
- "loss": 1.7883,
254
- "step": 740
255
- },
256
- {
257
- "epoch": 0.25,
258
- "learning_rate": 0.00017267904509283817,
259
- "loss": 1.8469,
260
- "step": 760
261
- },
262
- {
263
- "epoch": 0.26,
264
- "learning_rate": 0.000170026525198939,
265
- "loss": 1.8447,
266
- "step": 780
267
- },
268
- {
269
- "epoch": 0.27,
270
- "learning_rate": 0.00016737400530503977,
271
- "loss": 1.7872,
272
- "step": 800
273
- },
274
- {
275
- "epoch": 0.27,
276
- "eval_loss": 1.9704641103744507,
277
- "eval_runtime": 11963.7945,
278
- "eval_samples_per_second": 1.117,
279
- "eval_steps_per_second": 0.14,
280
- "step": 800
281
- },
282
- {
283
- "epoch": 0.27,
284
- "learning_rate": 0.0001647214854111406,
285
- "loss": 1.8032,
286
- "step": 820
287
- },
288
- {
289
- "epoch": 0.28,
290
- "learning_rate": 0.00016206896551724137,
291
- "loss": 1.8709,
292
- "step": 840
293
- },
294
- {
295
- "epoch": 0.29,
296
- "learning_rate": 0.00015941644562334216,
297
- "loss": 1.8801,
298
- "step": 860
299
- },
300
- {
301
- "epoch": 0.29,
302
- "learning_rate": 0.00015676392572944298,
303
- "loss": 1.8535,
304
- "step": 880
305
- },
306
- {
307
- "epoch": 0.3,
308
- "learning_rate": 0.00015411140583554373,
309
- "loss": 1.8631,
310
- "step": 900
311
- },
312
- {
313
- "epoch": 0.31,
314
- "learning_rate": 0.00015145888594164455,
315
- "loss": 1.752,
316
- "step": 920
317
- },
318
- {
319
- "epoch": 0.31,
320
- "learning_rate": 0.00014880636604774534,
321
- "loss": 1.8705,
322
- "step": 940
323
- },
324
- {
325
- "epoch": 0.32,
326
- "learning_rate": 0.00014615384615384615,
327
- "loss": 1.8301,
328
- "step": 960
329
- },
330
- {
331
- "epoch": 0.33,
332
- "learning_rate": 0.00014350132625994694,
333
- "loss": 1.7892,
334
- "step": 980
335
- },
336
- {
337
- "epoch": 0.33,
338
- "learning_rate": 0.00014084880636604772,
339
- "loss": 1.8671,
340
- "step": 1000
341
- },
342
- {
343
- "epoch": 0.33,
344
- "eval_loss": 1.9677250385284424,
345
- "eval_runtime": 11951.074,
346
- "eval_samples_per_second": 1.119,
347
- "eval_steps_per_second": 0.14,
348
- "step": 1000
349
- },
350
- {
351
- "epoch": 0.34,
352
- "learning_rate": 0.00013819628647214854,
353
- "loss": 1.8168,
354
- "step": 1020
355
- },
356
- {
357
- "epoch": 0.34,
358
- "learning_rate": 0.00013554376657824932,
359
- "loss": 1.8268,
360
- "step": 1040
361
- },
362
- {
363
- "epoch": 0.35,
364
- "learning_rate": 0.00013289124668435014,
365
- "loss": 1.8073,
366
- "step": 1060
367
- },
368
- {
369
- "epoch": 0.36,
370
- "learning_rate": 0.0001302387267904509,
371
- "loss": 1.9313,
372
- "step": 1080
373
- },
374
- {
375
- "epoch": 0.36,
376
- "learning_rate": 0.0001275862068965517,
377
- "loss": 1.9097,
378
- "step": 1100
379
- },
380
- {
381
- "epoch": 0.37,
382
- "learning_rate": 0.0001249336870026525,
383
- "loss": 1.9032,
384
- "step": 1120
385
- },
386
- {
387
- "epoch": 0.38,
388
- "learning_rate": 0.0001222811671087533,
389
- "loss": 1.8207,
390
- "step": 1140
391
- },
392
- {
393
- "epoch": 0.38,
394
- "learning_rate": 0.0001196286472148541,
395
- "loss": 1.8815,
396
- "step": 1160
397
- },
398
- {
399
- "epoch": 0.39,
400
- "learning_rate": 0.0001169761273209549,
401
- "loss": 1.8297,
402
- "step": 1180
403
- },
404
- {
405
- "epoch": 0.4,
406
- "learning_rate": 0.0001143236074270557,
407
- "loss": 1.8899,
408
- "step": 1200
409
- },
410
- {
411
- "epoch": 0.4,
412
- "eval_loss": 1.9722812175750732,
413
- "eval_runtime": 11951.1639,
414
- "eval_samples_per_second": 1.119,
415
- "eval_steps_per_second": 0.14,
416
- "step": 1200
417
- },
418
- {
419
- "epoch": 0.4,
420
- "learning_rate": 0.0001116710875331565,
421
- "loss": 1.8803,
422
- "step": 1220
423
- },
424
- {
425
- "epoch": 0.41,
426
- "learning_rate": 0.00010901856763925729,
427
- "loss": 1.8548,
428
- "step": 1240
429
- },
430
- {
431
- "epoch": 0.42,
432
- "learning_rate": 0.00010636604774535807,
433
- "loss": 1.8498,
434
- "step": 1260
435
- },
436
- {
437
- "epoch": 0.42,
438
- "learning_rate": 0.00010371352785145887,
439
- "loss": 1.8057,
440
- "step": 1280
441
- },
442
- {
443
- "epoch": 0.43,
444
- "learning_rate": 0.00010106100795755967,
445
- "loss": 1.8572,
446
- "step": 1300
447
- },
448
- {
449
- "epoch": 0.44,
450
- "learning_rate": 9.840848806366047e-05,
451
- "loss": 1.8469,
452
- "step": 1320
453
- },
454
- {
455
- "epoch": 0.44,
456
- "learning_rate": 9.575596816976126e-05,
457
- "loss": 1.8405,
458
- "step": 1340
459
- },
460
- {
461
- "epoch": 0.45,
462
- "learning_rate": 9.310344827586206e-05,
463
- "loss": 1.8193,
464
- "step": 1360
465
- },
466
- {
467
- "epoch": 0.46,
468
- "learning_rate": 9.045092838196286e-05,
469
- "loss": 1.7731,
470
- "step": 1380
471
- },
472
- {
473
- "epoch": 0.46,
474
- "learning_rate": 8.779840848806366e-05,
475
- "loss": 1.8329,
476
- "step": 1400
477
- },
478
- {
479
- "epoch": 0.46,
480
- "eval_loss": 1.9736511707305908,
481
- "eval_runtime": 11956.5916,
482
- "eval_samples_per_second": 1.118,
483
- "eval_steps_per_second": 0.14,
484
- "step": 1400
485
- },
486
- {
487
- "epoch": 0.47,
488
- "learning_rate": 8.514588859416444e-05,
489
- "loss": 1.8872,
490
- "step": 1420
491
- },
492
- {
493
- "epoch": 0.48,
494
- "learning_rate": 8.249336870026524e-05,
495
- "loss": 1.8511,
496
- "step": 1440
497
- },
498
- {
499
- "epoch": 0.48,
500
- "learning_rate": 7.984084880636604e-05,
501
- "loss": 1.7523,
502
- "step": 1460
503
- },
504
- {
505
- "epoch": 0.49,
506
- "learning_rate": 7.718832891246684e-05,
507
- "loss": 1.8619,
508
- "step": 1480
509
- },
510
- {
511
- "epoch": 0.5,
512
- "learning_rate": 7.453580901856764e-05,
513
- "loss": 1.8363,
514
- "step": 1500
515
- },
516
- {
517
- "epoch": 0.5,
518
- "learning_rate": 7.188328912466844e-05,
519
- "loss": 1.7745,
520
- "step": 1520
521
- },
522
- {
523
- "epoch": 0.51,
524
- "learning_rate": 6.923076923076922e-05,
525
- "loss": 1.8164,
526
- "step": 1540
527
- },
528
- {
529
- "epoch": 0.52,
530
- "learning_rate": 6.657824933687002e-05,
531
- "loss": 1.7997,
532
- "step": 1560
533
- },
534
- {
535
- "epoch": 0.52,
536
- "learning_rate": 6.392572944297081e-05,
537
- "loss": 1.8377,
538
- "step": 1580
539
- },
540
- {
541
- "epoch": 0.53,
542
- "learning_rate": 6.127320954907161e-05,
543
- "loss": 1.851,
544
- "step": 1600
545
- },
546
- {
547
- "epoch": 0.53,
548
- "eval_loss": 1.9729183912277222,
549
- "eval_runtime": 11970.5767,
550
- "eval_samples_per_second": 1.117,
551
- "eval_steps_per_second": 0.14,
552
- "step": 1600
553
- },
554
- {
555
- "epoch": 0.54,
556
- "learning_rate": 5.862068965517241e-05,
557
- "loss": 1.8382,
558
- "step": 1620
559
- },
560
- {
561
- "epoch": 0.54,
562
- "learning_rate": 5.59681697612732e-05,
563
- "loss": 1.8863,
564
- "step": 1640
565
- },
566
- {
567
- "epoch": 0.55,
568
- "learning_rate": 5.3315649867374e-05,
569
- "loss": 1.762,
570
- "step": 1660
571
- },
572
- {
573
- "epoch": 0.56,
574
- "learning_rate": 5.06631299734748e-05,
575
- "loss": 1.822,
576
- "step": 1680
577
- },
578
- {
579
- "epoch": 0.56,
580
- "learning_rate": 4.801061007957559e-05,
581
- "loss": 1.7776,
582
- "step": 1700
583
- },
584
- {
585
- "epoch": 0.57,
586
- "learning_rate": 4.5358090185676386e-05,
587
- "loss": 1.809,
588
- "step": 1720
589
- },
590
- {
591
- "epoch": 0.58,
592
- "learning_rate": 4.2705570291777186e-05,
593
- "loss": 1.912,
594
- "step": 1740
595
- },
596
- {
597
- "epoch": 0.58,
598
- "learning_rate": 4.005305039787798e-05,
599
- "loss": 1.7557,
600
- "step": 1760
601
- },
602
- {
603
- "epoch": 0.59,
604
- "learning_rate": 3.7400530503978774e-05,
605
- "loss": 1.8232,
606
- "step": 1780
607
- },
608
- {
609
- "epoch": 0.6,
610
- "learning_rate": 3.4748010610079574e-05,
611
- "loss": 1.8613,
612
- "step": 1800
613
- },
614
- {
615
- "epoch": 0.6,
616
- "eval_loss": 1.9704504013061523,
617
- "eval_runtime": 11980.5334,
618
- "eval_samples_per_second": 1.116,
619
- "eval_steps_per_second": 0.139,
620
- "step": 1800
621
- },
622
- {
623
- "epoch": 0.6,
624
- "learning_rate": 3.209549071618037e-05,
625
- "loss": 1.8378,
626
- "step": 1820
627
- },
628
- {
629
- "epoch": 0.61,
630
- "learning_rate": 2.9442970822281164e-05,
631
- "loss": 1.8871,
632
- "step": 1840
633
- },
634
- {
635
- "epoch": 0.62,
636
- "learning_rate": 2.679045092838196e-05,
637
- "loss": 1.8821,
638
- "step": 1860
639
- },
640
- {
641
- "epoch": 0.62,
642
- "learning_rate": 2.4137931034482755e-05,
643
- "loss": 1.8487,
644
- "step": 1880
645
- },
646
- {
647
- "epoch": 0.63,
648
- "learning_rate": 2.1485411140583555e-05,
649
- "loss": 1.8439,
650
- "step": 1900
651
- },
652
- {
653
- "epoch": 0.64,
654
- "learning_rate": 1.883289124668435e-05,
655
- "loss": 1.8469,
656
- "step": 1920
657
- },
658
- {
659
- "epoch": 0.64,
660
- "learning_rate": 1.6180371352785142e-05,
661
- "loss": 1.7797,
662
- "step": 1940
663
- },
664
- {
665
- "epoch": 0.65,
666
- "learning_rate": 1.352785145888594e-05,
667
- "loss": 1.7795,
668
- "step": 1960
669
- },
670
- {
671
- "epoch": 0.66,
672
- "learning_rate": 1.0875331564986736e-05,
673
- "loss": 1.8483,
674
- "step": 1980
675
- },
676
- {
677
- "epoch": 0.66,
678
- "learning_rate": 8.222811671087533e-06,
679
- "loss": 1.8321,
680
- "step": 2000
681
- },
682
- {
683
- "epoch": 0.66,
684
- "eval_loss": 1.9704707860946655,
685
- "eval_runtime": 11964.4776,
686
- "eval_samples_per_second": 1.117,
687
- "eval_steps_per_second": 0.14,
688
- "step": 2000
689
- },
690
- {
691
- "epoch": 0.67,
692
- "learning_rate": 5.5702917771883284e-06,
693
- "loss": 1.7708,
694
- "step": 2020
695
- },
696
- {
697
- "epoch": 0.68,
698
- "learning_rate": 2.9177718832891245e-06,
699
- "loss": 1.8731,
700
- "step": 2040
701
- },
702
- {
703
- "epoch": 0.68,
704
- "learning_rate": 2.652519893899204e-07,
705
- "loss": 1.8181,
706
- "step": 2060
707
- },
708
- {
709
- "epoch": 0.69,
710
- "learning_rate": 0.0,
711
- "loss": 1.8692,
712
- "step": 2080
713
- },
714
- {
715
- "epoch": 0.7,
716
- "learning_rate": 0.0,
717
- "loss": 1.8602,
718
- "step": 2100
719
- },
720
- {
721
- "epoch": 0.7,
722
- "learning_rate": 0.0,
723
- "loss": 1.8891,
724
- "step": 2120
725
- },
726
- {
727
- "epoch": 0.71,
728
- "learning_rate": 0.0,
729
- "loss": 1.8205,
730
- "step": 2140
731
- },
732
- {
733
- "epoch": 0.72,
734
- "learning_rate": 0.0,
735
- "loss": 1.8242,
736
- "step": 2160
737
- },
738
- {
739
- "epoch": 0.72,
740
- "learning_rate": 0.0,
741
- "loss": 1.8661,
742
- "step": 2180
743
- },
744
- {
745
- "epoch": 0.73,
746
- "learning_rate": 0.0,
747
- "loss": 1.7982,
748
- "step": 2200
749
- },
750
- {
751
- "epoch": 0.73,
752
- "eval_loss": 1.970503568649292,
753
- "eval_runtime": 11950.1314,
754
- "eval_samples_per_second": 1.119,
755
- "eval_steps_per_second": 0.14,
756
- "step": 2200
757
- },
758
- {
759
- "epoch": 0.74,
760
- "learning_rate": 0.0,
761
- "loss": 1.7967,
762
- "step": 2220
763
- },
764
- {
765
- "epoch": 0.74,
766
- "learning_rate": 0.0,
767
- "loss": 1.8346,
768
- "step": 2240
769
- },
770
- {
771
- "epoch": 0.75,
772
- "learning_rate": 0.0,
773
- "loss": 1.8242,
774
- "step": 2260
775
- },
776
- {
777
- "epoch": 0.76,
778
- "learning_rate": 0.0,
779
- "loss": 1.8429,
780
- "step": 2280
781
- },
782
- {
783
- "epoch": 0.76,
784
- "learning_rate": 0.0,
785
- "loss": 1.8795,
786
- "step": 2300
787
- },
788
- {
789
- "epoch": 0.77,
790
- "learning_rate": 0.0,
791
- "loss": 1.8967,
792
- "step": 2320
793
- },
794
- {
795
- "epoch": 0.78,
796
- "learning_rate": 0.0,
797
- "loss": 1.815,
798
- "step": 2340
799
- },
800
- {
801
- "epoch": 0.78,
802
- "learning_rate": 0.0,
803
- "loss": 1.8165,
804
- "step": 2360
805
- },
806
- {
807
- "epoch": 0.79,
808
- "learning_rate": 0.0,
809
- "loss": 1.838,
810
- "step": 2380
811
- },
812
- {
813
- "epoch": 0.8,
814
- "learning_rate": 0.0,
815
- "loss": 1.7798,
816
- "step": 2400
817
- },
818
- {
819
- "epoch": 0.8,
820
- "eval_loss": 1.970503568649292,
821
- "eval_runtime": 11951.0888,
822
- "eval_samples_per_second": 1.119,
823
- "eval_steps_per_second": 0.14,
824
- "step": 2400
825
- },
826
- {
827
- "epoch": 0.8,
828
- "learning_rate": 0.0,
829
- "loss": 1.7999,
830
- "step": 2420
831
- },
832
- {
833
- "epoch": 0.81,
834
- "learning_rate": 0.0,
835
- "loss": 1.7957,
836
- "step": 2440
837
- },
838
- {
839
- "epoch": 0.82,
840
- "learning_rate": 0.0,
841
- "loss": 1.8306,
842
- "step": 2460
843
- },
844
- {
845
- "epoch": 0.82,
846
- "learning_rate": 0.0,
847
- "loss": 1.87,
848
- "step": 2480
849
- },
850
- {
851
- "epoch": 0.83,
852
- "learning_rate": 0.0,
853
- "loss": 1.8046,
854
- "step": 2500
855
- },
856
- {
857
- "epoch": 0.84,
858
- "learning_rate": 0.0,
859
- "loss": 1.76,
860
- "step": 2520
861
- },
862
- {
863
- "epoch": 0.84,
864
- "learning_rate": 0.0,
865
- "loss": 1.869,
866
- "step": 2540
867
- },
868
- {
869
- "epoch": 0.85,
870
- "learning_rate": 0.0,
871
- "loss": 1.8684,
872
- "step": 2560
873
- },
874
- {
875
- "epoch": 0.86,
876
- "learning_rate": 0.0,
877
- "loss": 1.8757,
878
- "step": 2580
879
- },
880
- {
881
- "epoch": 0.86,
882
- "learning_rate": 0.0,
883
- "loss": 1.868,
884
- "step": 2600
885
- },
886
- {
887
- "epoch": 0.86,
888
- "eval_loss": 1.970503568649292,
889
- "eval_runtime": 11952.488,
890
- "eval_samples_per_second": 1.118,
891
- "eval_steps_per_second": 0.14,
892
- "step": 2600
893
- },
894
- {
895
- "epoch": 0.87,
896
- "learning_rate": 0.0,
897
- "loss": 1.8228,
898
- "step": 2620
899
- },
900
- {
901
- "epoch": 0.88,
902
- "learning_rate": 0.0,
903
- "loss": 1.8721,
904
- "step": 2640
905
- },
906
- {
907
- "epoch": 0.88,
908
- "learning_rate": 0.0,
909
- "loss": 1.8557,
910
- "step": 2660
911
- },
912
- {
913
- "epoch": 0.89,
914
- "learning_rate": 0.0,
915
- "loss": 1.8545,
916
- "step": 2680
917
- },
918
- {
919
- "epoch": 0.9,
920
- "learning_rate": 0.0,
921
- "loss": 1.8366,
922
- "step": 2700
923
- },
924
- {
925
- "epoch": 0.9,
926
- "learning_rate": 0.0,
927
- "loss": 1.8381,
928
- "step": 2720
929
- },
930
- {
931
- "epoch": 0.91,
932
- "learning_rate": 0.0,
933
- "loss": 1.8305,
934
- "step": 2740
935
- },
936
- {
937
- "epoch": 0.92,
938
- "learning_rate": 0.0,
939
- "loss": 1.8162,
940
- "step": 2760
941
- },
942
- {
943
- "epoch": 0.92,
944
- "learning_rate": 0.0,
945
- "loss": 1.8109,
946
- "step": 2780
947
- },
948
- {
949
- "epoch": 0.93,
950
- "learning_rate": 0.0,
951
- "loss": 1.8694,
952
- "step": 2800
953
- },
954
- {
955
- "epoch": 0.93,
956
- "eval_loss": 1.970503568649292,
957
- "eval_runtime": 11969.9504,
958
- "eval_samples_per_second": 1.117,
959
- "eval_steps_per_second": 0.14,
960
- "step": 2800
961
- },
962
- {
963
- "epoch": 0.94,
964
- "learning_rate": 0.0,
965
- "loss": 1.8201,
966
- "step": 2820
967
- },
968
- {
969
- "epoch": 0.94,
970
- "learning_rate": 0.0,
971
- "loss": 1.848,
972
- "step": 2840
973
- },
974
- {
975
- "epoch": 0.95,
976
- "learning_rate": 0.0,
977
- "loss": 1.8138,
978
- "step": 2860
979
- },
980
- {
981
- "epoch": 0.96,
982
- "learning_rate": 0.0,
983
- "loss": 1.8554,
984
- "step": 2880
985
- },
986
- {
987
- "epoch": 0.96,
988
- "learning_rate": 0.0,
989
- "loss": 1.8321,
990
- "step": 2900
991
- },
992
- {
993
- "epoch": 0.97,
994
- "learning_rate": 0.0,
995
- "loss": 1.8404,
996
- "step": 2920
997
- },
998
- {
999
- "epoch": 0.98,
1000
- "learning_rate": 0.0,
1001
- "loss": 1.7883,
1002
- "step": 2940
1003
- },
1004
- {
1005
- "epoch": 0.98,
1006
- "learning_rate": 0.0,
1007
- "loss": 1.8733,
1008
- "step": 2960
1009
- },
1010
- {
1011
- "epoch": 0.99,
1012
- "learning_rate": 0.0,
1013
- "loss": 1.8074,
1014
- "step": 2980
1015
- },
1016
- {
1017
- "epoch": 1.0,
1018
- "learning_rate": 0.0,
1019
- "loss": 1.887,
1020
- "step": 3000
1021
- },
1022
- {
1023
- "epoch": 1.0,
1024
- "eval_loss": 1.970503568649292,
1025
- "eval_runtime": 11974.5637,
1026
- "eval_samples_per_second": 1.116,
1027
- "eval_steps_per_second": 0.14,
1028
- "step": 3000
1029
- },
1030
- {
1031
- "epoch": 1.0,
1032
- "learning_rate": 0.0,
1033
- "loss": 1.7165,
1034
- "step": 3020
1035
- },
1036
- {
1037
- "epoch": 1.01,
1038
- "learning_rate": 0.0,
1039
- "loss": 1.8041,
1040
- "step": 3040
1041
- },
1042
- {
1043
- "epoch": 1.01,
1044
- "learning_rate": 0.0,
1045
- "loss": 1.8031,
1046
- "step": 3060
1047
- },
1048
- {
1049
- "epoch": 1.02,
1050
- "learning_rate": 0.0,
1051
- "loss": 1.8493,
1052
- "step": 3080
1053
- },
1054
- {
1055
- "epoch": 1.03,
1056
- "learning_rate": 0.0,
1057
- "loss": 1.8475,
1058
- "step": 3100
1059
- },
1060
- {
1061
- "epoch": 1.03,
1062
- "learning_rate": 0.0,
1063
- "loss": 1.7994,
1064
- "step": 3120
1065
- },
1066
- {
1067
- "epoch": 1.04,
1068
- "learning_rate": 0.0,
1069
- "loss": 1.7904,
1070
- "step": 3140
1071
- },
1072
- {
1073
- "epoch": 1.05,
1074
- "learning_rate": 0.0,
1075
- "loss": 1.8078,
1076
- "step": 3160
1077
- },
1078
- {
1079
- "epoch": 1.05,
1080
- "learning_rate": 0.0,
1081
- "loss": 1.8625,
1082
- "step": 3180
1083
- },
1084
- {
1085
- "epoch": 1.06,
1086
- "learning_rate": 0.0,
1087
- "loss": 1.7874,
1088
- "step": 3200
1089
- },
1090
- {
1091
- "epoch": 1.06,
1092
- "eval_loss": 1.970503568649292,
1093
- "eval_runtime": 11966.566,
1094
- "eval_samples_per_second": 1.117,
1095
- "eval_steps_per_second": 0.14,
1096
- "step": 3200
1097
- },
1098
- {
1099
- "epoch": 1.07,
1100
- "learning_rate": 0.0,
1101
- "loss": 1.7956,
1102
- "step": 3220
1103
- },
1104
- {
1105
- "epoch": 1.07,
1106
- "learning_rate": 0.0,
1107
- "loss": 1.7517,
1108
- "step": 3240
1109
- },
1110
- {
1111
- "epoch": 1.08,
1112
- "learning_rate": 0.0,
1113
- "loss": 1.7861,
1114
- "step": 3260
1115
- },
1116
- {
1117
- "epoch": 1.09,
1118
- "learning_rate": 0.0,
1119
- "loss": 1.7957,
1120
- "step": 3280
1121
- },
1122
- {
1123
- "epoch": 1.09,
1124
- "learning_rate": 0.0,
1125
- "loss": 1.8147,
1126
- "step": 3300
1127
- },
1128
- {
1129
- "epoch": 1.1,
1130
- "learning_rate": 0.0,
1131
- "loss": 1.7795,
1132
- "step": 3320
1133
- },
1134
- {
1135
- "epoch": 1.11,
1136
- "learning_rate": 0.0,
1137
- "loss": 1.7532,
1138
- "step": 3340
1139
- },
1140
- {
1141
- "epoch": 1.11,
1142
- "learning_rate": 0.0,
1143
- "loss": 1.8414,
1144
- "step": 3360
1145
- },
1146
- {
1147
- "epoch": 1.12,
1148
- "learning_rate": 0.0,
1149
- "loss": 1.771,
1150
- "step": 3380
1151
- },
1152
- {
1153
- "epoch": 1.13,
1154
- "learning_rate": 0.0,
1155
- "loss": 1.8119,
1156
- "step": 3400
1157
- },
1158
- {
1159
- "epoch": 1.13,
1160
- "eval_loss": 1.970503568649292,
1161
- "eval_runtime": 11954.7549,
1162
- "eval_samples_per_second": 1.118,
1163
- "eval_steps_per_second": 0.14,
1164
- "step": 3400
1165
- },
1166
- {
1167
- "epoch": 1.13,
1168
- "learning_rate": 0.0,
1169
- "loss": 1.7944,
1170
- "step": 3420
1171
- },
1172
- {
1173
- "epoch": 1.14,
1174
- "learning_rate": 0.0,
1175
- "loss": 1.7855,
1176
- "step": 3440
1177
- },
1178
- {
1179
- "epoch": 1.15,
1180
- "learning_rate": 0.0,
1181
- "loss": 1.8574,
1182
- "step": 3460
1183
- },
1184
- {
1185
- "epoch": 1.15,
1186
- "learning_rate": 0.0,
1187
- "loss": 1.8189,
1188
- "step": 3480
1189
- },
1190
- {
1191
- "epoch": 1.16,
1192
- "learning_rate": 0.0,
1193
- "loss": 1.8033,
1194
- "step": 3500
1195
- },
1196
- {
1197
- "epoch": 1.17,
1198
- "learning_rate": 0.0,
1199
- "loss": 1.7689,
1200
- "step": 3520
1201
- },
1202
- {
1203
- "epoch": 1.17,
1204
- "learning_rate": 0.0,
1205
- "loss": 1.8455,
1206
- "step": 3540
1207
- },
1208
- {
1209
- "epoch": 1.18,
1210
- "learning_rate": 0.0,
1211
- "loss": 1.8255,
1212
- "step": 3560
1213
- },
1214
- {
1215
- "epoch": 1.19,
1216
- "learning_rate": 0.0,
1217
- "loss": 1.8431,
1218
- "step": 3580
1219
- },
1220
- {
1221
- "epoch": 1.19,
1222
- "learning_rate": 0.0,
1223
- "loss": 1.7924,
1224
- "step": 3600
1225
- },
1226
- {
1227
- "epoch": 1.19,
1228
- "eval_loss": 1.970503568649292,
1229
- "eval_runtime": 11953.5885,
1230
- "eval_samples_per_second": 1.118,
1231
- "eval_steps_per_second": 0.14,
1232
- "step": 3600
1233
- },
1234
- {
1235
- "epoch": 1.2,
1236
- "learning_rate": 0.0,
1237
- "loss": 1.8475,
1238
- "step": 3620
1239
- },
1240
- {
1241
- "epoch": 1.21,
1242
- "learning_rate": 0.0,
1243
- "loss": 1.6746,
1244
- "step": 3640
1245
- },
1246
- {
1247
- "epoch": 1.21,
1248
- "learning_rate": 0.0,
1249
- "loss": 1.8006,
1250
- "step": 3660
1251
- },
1252
- {
1253
- "epoch": 1.22,
1254
- "learning_rate": 0.0,
1255
- "loss": 1.816,
1256
- "step": 3680
1257
- },
1258
- {
1259
- "epoch": 1.23,
1260
- "learning_rate": 0.0,
1261
- "loss": 1.7486,
1262
- "step": 3700
1263
- },
1264
- {
1265
- "epoch": 1.23,
1266
- "learning_rate": 0.0,
1267
- "loss": 1.8086,
1268
- "step": 3720
1269
- },
1270
- {
1271
- "epoch": 1.24,
1272
- "learning_rate": 0.0,
1273
- "loss": 1.8241,
1274
- "step": 3740
1275
- },
1276
- {
1277
- "epoch": 1.25,
1278
- "learning_rate": 0.0,
1279
- "loss": 1.823,
1280
- "step": 3760
1281
- },
1282
- {
1283
- "epoch": 1.25,
1284
- "learning_rate": 0.0,
1285
- "loss": 1.7365,
1286
- "step": 3780
1287
- },
1288
- {
1289
- "epoch": 1.26,
1290
- "learning_rate": 0.0,
1291
- "loss": 1.801,
1292
- "step": 3800
1293
- },
1294
- {
1295
- "epoch": 1.26,
1296
- "eval_loss": 1.970503568649292,
1297
- "eval_runtime": 11952.013,
1298
- "eval_samples_per_second": 1.118,
1299
- "eval_steps_per_second": 0.14,
1300
- "step": 3800
1301
- },
1302
- {
1303
- "epoch": 1.27,
1304
- "learning_rate": 0.0,
1305
- "loss": 1.8497,
1306
- "step": 3820
1307
- },
1308
- {
1309
- "epoch": 1.27,
1310
- "learning_rate": 0.0,
1311
- "loss": 1.7803,
1312
- "step": 3840
1313
- },
1314
- {
1315
- "epoch": 1.28,
1316
- "learning_rate": 0.0,
1317
- "loss": 1.6921,
1318
- "step": 3860
1319
- },
1320
- {
1321
- "epoch": 1.29,
1322
- "learning_rate": 0.0,
1323
- "loss": 1.8508,
1324
- "step": 3880
1325
- },
1326
- {
1327
- "epoch": 1.29,
1328
- "learning_rate": 0.0,
1329
- "loss": 1.8816,
1330
- "step": 3900
1331
- },
1332
- {
1333
- "epoch": 1.3,
1334
- "learning_rate": 0.0,
1335
- "loss": 1.8105,
1336
- "step": 3920
1337
- },
1338
- {
1339
- "epoch": 1.31,
1340
- "learning_rate": 0.0,
1341
- "loss": 1.8477,
1342
- "step": 3940
1343
- },
1344
- {
1345
- "epoch": 1.31,
1346
- "learning_rate": 0.0,
1347
- "loss": 1.841,
1348
- "step": 3960
1349
- },
1350
- {
1351
- "epoch": 1.32,
1352
- "learning_rate": 0.0,
1353
- "loss": 1.7599,
1354
- "step": 3980
1355
- },
1356
- {
1357
- "epoch": 1.33,
1358
- "learning_rate": 0.0,
1359
- "loss": 1.7074,
1360
- "step": 4000
1361
- },
1362
- {
1363
- "epoch": 1.33,
1364
- "eval_loss": 1.970503568649292,
1365
- "eval_runtime": 11952.3623,
1366
- "eval_samples_per_second": 1.118,
1367
- "eval_steps_per_second": 0.14,
1368
- "step": 4000
1369
- },
1370
- {
1371
- "epoch": 1.33,
1372
- "learning_rate": 0.0,
1373
- "loss": 1.7529,
1374
- "step": 4020
1375
- },
1376
- {
1377
- "epoch": 1.34,
1378
- "learning_rate": 0.0,
1379
- "loss": 1.8928,
1380
- "step": 4040
1381
- },
1382
- {
1383
- "epoch": 1.35,
1384
- "learning_rate": 0.0,
1385
- "loss": 1.8585,
1386
- "step": 4060
1387
- },
1388
- {
1389
- "epoch": 1.35,
1390
- "learning_rate": 0.0,
1391
- "loss": 1.8279,
1392
- "step": 4080
1393
- },
1394
- {
1395
- "epoch": 1.36,
1396
- "learning_rate": 0.0,
1397
- "loss": 1.7949,
1398
- "step": 4100
1399
- },
1400
- {
1401
- "epoch": 1.37,
1402
- "learning_rate": 0.0,
1403
- "loss": 1.8488,
1404
- "step": 4120
1405
- },
1406
- {
1407
- "epoch": 1.37,
1408
- "learning_rate": 0.0,
1409
- "loss": 1.7572,
1410
- "step": 4140
1411
- },
1412
- {
1413
- "epoch": 1.38,
1414
- "learning_rate": 0.0,
1415
- "loss": 1.7533,
1416
- "step": 4160
1417
- },
1418
- {
1419
- "epoch": 1.39,
1420
- "learning_rate": 0.0,
1421
- "loss": 1.772,
1422
- "step": 4180
1423
- },
1424
- {
1425
- "epoch": 1.39,
1426
- "learning_rate": 0.0,
1427
- "loss": 1.8338,
1428
- "step": 4200
1429
- },
1430
- {
1431
- "epoch": 1.39,
1432
- "eval_loss": 1.970503568649292,
1433
- "eval_runtime": 11954.2113,
1434
- "eval_samples_per_second": 1.118,
1435
- "eval_steps_per_second": 0.14,
1436
- "step": 4200
1437
- },
1438
- {
1439
- "epoch": 1.4,
1440
- "learning_rate": 0.0,
1441
- "loss": 1.8084,
1442
- "step": 4220
1443
- },
1444
- {
1445
- "epoch": 1.41,
1446
- "learning_rate": 0.0,
1447
- "loss": 1.8791,
1448
- "step": 4240
1449
- },
1450
- {
1451
- "epoch": 1.41,
1452
- "learning_rate": 0.0,
1453
- "loss": 1.7906,
1454
- "step": 4260
1455
- },
1456
- {
1457
- "epoch": 1.42,
1458
- "learning_rate": 0.0,
1459
- "loss": 1.8669,
1460
- "step": 4280
1461
- },
1462
- {
1463
- "epoch": 1.43,
1464
- "learning_rate": 0.0,
1465
- "loss": 1.8108,
1466
- "step": 4300
1467
- },
1468
- {
1469
- "epoch": 1.43,
1470
- "learning_rate": 0.0,
1471
- "loss": 1.7769,
1472
- "step": 4320
1473
- },
1474
- {
1475
- "epoch": 1.44,
1476
- "learning_rate": 0.0,
1477
- "loss": 1.7599,
1478
- "step": 4340
1479
- },
1480
- {
1481
- "epoch": 1.45,
1482
- "learning_rate": 0.0,
1483
- "loss": 1.843,
1484
- "step": 4360
1485
- },
1486
- {
1487
- "epoch": 1.45,
1488
- "learning_rate": 0.0,
1489
- "loss": 1.8608,
1490
- "step": 4380
1491
- },
1492
- {
1493
- "epoch": 1.46,
1494
- "learning_rate": 0.0,
1495
- "loss": 1.8382,
1496
- "step": 4400
1497
- },
1498
- {
1499
- "epoch": 1.46,
1500
- "eval_loss": 1.970503568649292,
1501
- "eval_runtime": 11958.7166,
1502
- "eval_samples_per_second": 1.118,
1503
- "eval_steps_per_second": 0.14,
1504
- "step": 4400
1505
- },
1506
- {
1507
- "epoch": 1.47,
1508
- "learning_rate": 0.0,
1509
- "loss": 1.8002,
1510
- "step": 4420
1511
- },
1512
- {
1513
- "epoch": 1.47,
1514
- "learning_rate": 0.0,
1515
- "loss": 1.8292,
1516
- "step": 4440
1517
- },
1518
- {
1519
- "epoch": 1.48,
1520
- "learning_rate": 0.0,
1521
- "loss": 1.8184,
1522
- "step": 4460
1523
- },
1524
- {
1525
- "epoch": 1.49,
1526
- "learning_rate": 0.0,
1527
- "loss": 1.7712,
1528
- "step": 4480
1529
- },
1530
- {
1531
- "epoch": 1.49,
1532
- "learning_rate": 0.0,
1533
- "loss": 1.8307,
1534
- "step": 4500
1535
- },
1536
- {
1537
- "epoch": 1.5,
1538
- "learning_rate": 0.0,
1539
- "loss": 1.7883,
1540
- "step": 4520
1541
- },
1542
- {
1543
- "epoch": 1.51,
1544
- "learning_rate": 0.0,
1545
- "loss": 1.8106,
1546
- "step": 4540
1547
- },
1548
- {
1549
- "epoch": 1.51,
1550
- "learning_rate": 0.0,
1551
- "loss": 1.7999,
1552
- "step": 4560
1553
- },
1554
- {
1555
- "epoch": 1.52,
1556
- "learning_rate": 0.0,
1557
- "loss": 1.8353,
1558
- "step": 4580
1559
- },
1560
- {
1561
- "epoch": 1.53,
1562
- "learning_rate": 0.0,
1563
- "loss": 1.8366,
1564
- "step": 4600
1565
- },
1566
- {
1567
- "epoch": 1.53,
1568
- "eval_loss": 1.970503568649292,
1569
- "eval_runtime": 11971.0491,
1570
- "eval_samples_per_second": 1.117,
1571
- "eval_steps_per_second": 0.14,
1572
- "step": 4600
1573
- },
1574
- {
1575
- "epoch": 1.53,
1576
- "learning_rate": 0.0,
1577
- "loss": 1.8018,
1578
- "step": 4620
1579
- },
1580
- {
1581
- "epoch": 1.54,
1582
- "learning_rate": 0.0,
1583
- "loss": 1.84,
1584
- "step": 4640
1585
- },
1586
- {
1587
- "epoch": 1.55,
1588
- "learning_rate": 0.0,
1589
- "loss": 1.7784,
1590
- "step": 4660
1591
- },
1592
- {
1593
- "epoch": 1.55,
1594
- "learning_rate": 0.0,
1595
- "loss": 1.794,
1596
- "step": 4680
1597
- },
1598
- {
1599
- "epoch": 1.56,
1600
- "learning_rate": 0.0,
1601
- "loss": 1.8237,
1602
- "step": 4700
1603
- },
1604
- {
1605
- "epoch": 1.57,
1606
- "learning_rate": 0.0,
1607
- "loss": 1.7697,
1608
- "step": 4720
1609
- },
1610
- {
1611
- "epoch": 1.57,
1612
- "learning_rate": 0.0,
1613
- "loss": 1.8482,
1614
- "step": 4740
1615
- },
1616
- {
1617
- "epoch": 1.58,
1618
- "learning_rate": 0.0,
1619
- "loss": 1.8008,
1620
- "step": 4760
1621
- },
1622
- {
1623
- "epoch": 1.59,
1624
- "learning_rate": 0.0,
1625
- "loss": 1.8082,
1626
- "step": 4780
1627
- },
1628
- {
1629
- "epoch": 1.59,
1630
- "learning_rate": 0.0,
1631
- "loss": 1.7799,
1632
- "step": 4800
1633
- },
1634
- {
1635
- "epoch": 1.59,
1636
- "eval_loss": 1.970503568649292,
1637
- "eval_runtime": 11955.9821,
1638
- "eval_samples_per_second": 1.118,
1639
- "eval_steps_per_second": 0.14,
1640
- "step": 4800
1641
- },
1642
- {
1643
- "epoch": 1.6,
1644
- "learning_rate": 0.0,
1645
- "loss": 1.8339,
1646
- "step": 4820
1647
- },
1648
- {
1649
- "epoch": 1.61,
1650
- "learning_rate": 0.0,
1651
- "loss": 1.8072,
1652
- "step": 4840
1653
- },
1654
- {
1655
- "epoch": 1.61,
1656
- "learning_rate": 0.0,
1657
- "loss": 1.8024,
1658
- "step": 4860
1659
- },
1660
- {
1661
- "epoch": 1.62,
1662
- "learning_rate": 0.0,
1663
- "loss": 1.8609,
1664
- "step": 4880
1665
- },
1666
- {
1667
- "epoch": 1.63,
1668
- "learning_rate": 0.0,
1669
- "loss": 1.8769,
1670
- "step": 4900
1671
- },
1672
- {
1673
- "epoch": 1.63,
1674
- "learning_rate": 0.0,
1675
- "loss": 1.808,
1676
- "step": 4920
1677
- },
1678
- {
1679
- "epoch": 1.64,
1680
- "learning_rate": 0.0,
1681
- "loss": 1.8482,
1682
- "step": 4940
1683
- },
1684
- {
1685
- "epoch": 1.65,
1686
- "learning_rate": 0.0,
1687
- "loss": 1.8116,
1688
- "step": 4960
1689
- },
1690
- {
1691
- "epoch": 1.65,
1692
- "learning_rate": 0.0,
1693
- "loss": 1.7922,
1694
- "step": 4980
1695
- },
1696
- {
1697
- "epoch": 1.66,
1698
- "learning_rate": 0.0,
1699
- "loss": 1.8409,
1700
- "step": 5000
1701
- },
1702
- {
1703
- "epoch": 1.66,
1704
- "eval_loss": 1.970503568649292,
1705
- "eval_runtime": 11953.6568,
1706
- "eval_samples_per_second": 1.118,
1707
- "eval_steps_per_second": 0.14,
1708
- "step": 5000
1709
- },
1710
- {
1711
- "epoch": 1.67,
1712
- "learning_rate": 0.0,
1713
- "loss": 1.867,
1714
- "step": 5020
1715
- },
1716
- {
1717
- "epoch": 1.67,
1718
- "learning_rate": 0.0,
1719
- "loss": 1.8902,
1720
- "step": 5040
1721
- },
1722
- {
1723
- "epoch": 1.68,
1724
- "learning_rate": 0.0,
1725
- "loss": 1.8835,
1726
- "step": 5060
1727
- },
1728
- {
1729
- "epoch": 1.68,
1730
- "learning_rate": 0.0,
1731
- "loss": 1.741,
1732
- "step": 5080
1733
- },
1734
- {
1735
- "epoch": 1.69,
1736
- "learning_rate": 0.0,
1737
- "loss": 1.7543,
1738
- "step": 5100
1739
- },
1740
- {
1741
- "epoch": 1.7,
1742
- "learning_rate": 0.0,
1743
- "loss": 1.7913,
1744
- "step": 5120
1745
- },
1746
- {
1747
- "epoch": 1.7,
1748
- "learning_rate": 0.0,
1749
- "loss": 1.829,
1750
- "step": 5140
1751
- },
1752
- {
1753
- "epoch": 1.71,
1754
- "learning_rate": 0.0,
1755
- "loss": 1.7915,
1756
- "step": 5160
1757
- },
1758
- {
1759
- "epoch": 1.72,
1760
- "learning_rate": 0.0,
1761
- "loss": 1.767,
1762
- "step": 5180
1763
- },
1764
- {
1765
- "epoch": 1.72,
1766
- "learning_rate": 0.0,
1767
- "loss": 1.7809,
1768
- "step": 5200
1769
- },
1770
- {
1771
- "epoch": 1.72,
1772
- "eval_loss": 1.970503568649292,
1773
- "eval_runtime": 11951.295,
1774
- "eval_samples_per_second": 1.119,
1775
- "eval_steps_per_second": 0.14,
1776
- "step": 5200
1777
- },
1778
- {
1779
- "epoch": 1.73,
1780
- "learning_rate": 0.0,
1781
- "loss": 1.8715,
1782
- "step": 5220
1783
- },
1784
- {
1785
- "epoch": 1.74,
1786
- "learning_rate": 0.0,
1787
- "loss": 1.8517,
1788
- "step": 5240
1789
- },
1790
- {
1791
- "epoch": 1.74,
1792
- "learning_rate": 0.0,
1793
- "loss": 1.8255,
1794
- "step": 5260
1795
- },
1796
- {
1797
- "epoch": 1.75,
1798
- "learning_rate": 0.0,
1799
- "loss": 1.7973,
1800
- "step": 5280
1801
- },
1802
- {
1803
- "epoch": 1.76,
1804
- "learning_rate": 0.0,
1805
- "loss": 1.7912,
1806
- "step": 5300
1807
- },
1808
- {
1809
- "epoch": 1.76,
1810
- "learning_rate": 0.0,
1811
- "loss": 1.8331,
1812
- "step": 5320
1813
- },
1814
- {
1815
- "epoch": 1.77,
1816
- "learning_rate": 0.0,
1817
- "loss": 1.835,
1818
- "step": 5340
1819
- },
1820
- {
1821
- "epoch": 1.78,
1822
- "learning_rate": 0.0,
1823
- "loss": 1.8051,
1824
- "step": 5360
1825
- },
1826
- {
1827
- "epoch": 1.78,
1828
- "learning_rate": 0.0,
1829
- "loss": 1.8007,
1830
- "step": 5380
1831
- },
1832
- {
1833
- "epoch": 1.79,
1834
- "learning_rate": 0.0,
1835
- "loss": 1.8069,
1836
- "step": 5400
1837
- },
1838
- {
1839
- "epoch": 1.79,
1840
- "eval_loss": 1.970503568649292,
1841
- "eval_runtime": 11948.5086,
1842
- "eval_samples_per_second": 1.119,
1843
- "eval_steps_per_second": 0.14,
1844
- "step": 5400
1845
- },
1846
- {
1847
- "epoch": 1.8,
1848
- "learning_rate": 0.0,
1849
- "loss": 1.784,
1850
- "step": 5420
1851
- },
1852
- {
1853
- "epoch": 1.8,
1854
- "learning_rate": 0.0,
1855
- "loss": 1.7557,
1856
- "step": 5440
1857
- },
1858
- {
1859
- "epoch": 1.81,
1860
- "learning_rate": 0.0,
1861
- "loss": 1.8011,
1862
- "step": 5460
1863
- },
1864
- {
1865
- "epoch": 1.82,
1866
- "learning_rate": 0.0,
1867
- "loss": 1.8392,
1868
- "step": 5480
1869
- },
1870
- {
1871
- "epoch": 1.82,
1872
- "learning_rate": 0.0,
1873
- "loss": 1.7507,
1874
- "step": 5500
1875
- },
1876
- {
1877
- "epoch": 1.83,
1878
- "learning_rate": 0.0,
1879
- "loss": 1.8043,
1880
- "step": 5520
1881
- },
1882
- {
1883
- "epoch": 1.84,
1884
- "learning_rate": 0.0,
1885
- "loss": 1.7804,
1886
- "step": 5540
1887
- },
1888
- {
1889
- "epoch": 1.84,
1890
- "learning_rate": 0.0,
1891
- "loss": 1.811,
1892
- "step": 5560
1893
- },
1894
- {
1895
- "epoch": 1.85,
1896
- "learning_rate": 0.0,
1897
- "loss": 1.806,
1898
- "step": 5580
1899
- },
1900
- {
1901
- "epoch": 1.86,
1902
- "learning_rate": 0.0,
1903
- "loss": 1.8103,
1904
- "step": 5600
1905
- },
1906
- {
1907
- "epoch": 1.86,
1908
- "eval_loss": 1.970503568649292,
1909
- "eval_runtime": 11953.5582,
1910
- "eval_samples_per_second": 1.118,
1911
- "eval_steps_per_second": 0.14,
1912
- "step": 5600
1913
- },
1914
- {
1915
- "epoch": 1.86,
1916
- "learning_rate": 0.0,
1917
- "loss": 1.7947,
1918
- "step": 5620
1919
- },
1920
- {
1921
- "epoch": 1.87,
1922
- "learning_rate": 0.0,
1923
- "loss": 1.8062,
1924
- "step": 5640
1925
- },
1926
- {
1927
- "epoch": 1.88,
1928
- "learning_rate": 0.0,
1929
- "loss": 1.7997,
1930
- "step": 5660
1931
- },
1932
- {
1933
- "epoch": 1.88,
1934
- "learning_rate": 0.0,
1935
- "loss": 1.843,
1936
- "step": 5680
1937
- },
1938
- {
1939
- "epoch": 1.89,
1940
- "learning_rate": 0.0,
1941
- "loss": 1.848,
1942
- "step": 5700
1943
- },
1944
- {
1945
- "epoch": 1.9,
1946
- "learning_rate": 0.0,
1947
- "loss": 1.8393,
1948
- "step": 5720
1949
- },
1950
- {
1951
- "epoch": 1.9,
1952
- "learning_rate": 0.0,
1953
- "loss": 1.8187,
1954
- "step": 5740
1955
- },
1956
- {
1957
- "epoch": 1.91,
1958
- "learning_rate": 0.0,
1959
- "loss": 1.802,
1960
- "step": 5760
1961
- },
1962
- {
1963
- "epoch": 1.92,
1964
- "learning_rate": 0.0,
1965
- "loss": 1.7646,
1966
- "step": 5780
1967
- },
1968
- {
1969
- "epoch": 1.92,
1970
- "learning_rate": 0.0,
1971
- "loss": 1.7993,
1972
- "step": 5800
1973
- },
1974
- {
1975
- "epoch": 1.92,
1976
- "eval_loss": 1.970503568649292,
1977
- "eval_runtime": 11953.0956,
1978
- "eval_samples_per_second": 1.118,
1979
- "eval_steps_per_second": 0.14,
1980
- "step": 5800
1981
- },
1982
- {
1983
- "epoch": 1.93,
1984
- "learning_rate": 0.0,
1985
- "loss": 1.7962,
1986
- "step": 5820
1987
- },
1988
- {
1989
- "epoch": 1.94,
1990
- "learning_rate": 0.0,
1991
- "loss": 1.849,
1992
- "step": 5840
1993
- },
1994
- {
1995
- "epoch": 1.94,
1996
- "learning_rate": 0.0,
1997
- "loss": 1.8254,
1998
- "step": 5860
1999
- },
2000
- {
2001
- "epoch": 1.95,
2002
- "learning_rate": 0.0,
2003
- "loss": 1.8583,
2004
- "step": 5880
2005
- },
2006
- {
2007
- "epoch": 1.96,
2008
- "learning_rate": 0.0,
2009
- "loss": 1.8398,
2010
- "step": 5900
2011
- },
2012
- {
2013
- "epoch": 1.96,
2014
- "learning_rate": 0.0,
2015
- "loss": 1.7306,
2016
- "step": 5920
2017
- },
2018
- {
2019
- "epoch": 1.97,
2020
- "learning_rate": 0.0,
2021
- "loss": 1.81,
2022
- "step": 5940
2023
- },
2024
- {
2025
- "epoch": 1.98,
2026
- "learning_rate": 0.0,
2027
- "loss": 1.7762,
2028
- "step": 5960
2029
- },
2030
- {
2031
- "epoch": 1.98,
2032
- "learning_rate": 0.0,
2033
- "loss": 1.8549,
2034
- "step": 5980
2035
- },
2036
- {
2037
- "epoch": 1.99,
2038
- "learning_rate": 0.0,
2039
- "loss": 1.84,
2040
- "step": 6000
2041
- },
2042
- {
2043
- "epoch": 1.99,
2044
- "eval_loss": 1.970503568649292,
2045
- "eval_runtime": 11950.3766,
2046
- "eval_samples_per_second": 1.119,
2047
- "eval_steps_per_second": 0.14,
2048
- "step": 6000
2049
- },
2050
- {
2051
- "epoch": 2.0,
2052
- "learning_rate": 0.0,
2053
- "loss": 1.7897,
2054
- "step": 6020
2055
- },
2056
- {
2057
- "epoch": 2.0,
2058
- "learning_rate": 0.0,
2059
- "loss": 1.7844,
2060
- "step": 6040
2061
- },
2062
- {
2063
- "epoch": 2.01,
2064
- "learning_rate": 0.0,
2065
- "loss": 1.8453,
2066
- "step": 6060
2067
- },
2068
- {
2069
- "epoch": 2.02,
2070
- "learning_rate": 0.0,
2071
- "loss": 1.8268,
2072
- "step": 6080
2073
- },
2074
- {
2075
- "epoch": 2.02,
2076
- "learning_rate": 0.0,
2077
- "loss": 1.7932,
2078
- "step": 6100
2079
- },
2080
- {
2081
- "epoch": 2.03,
2082
- "learning_rate": 0.0,
2083
- "loss": 1.8005,
2084
- "step": 6120
2085
- },
2086
- {
2087
- "epoch": 2.04,
2088
- "learning_rate": 0.0,
2089
- "loss": 1.773,
2090
- "step": 6140
2091
- },
2092
- {
2093
- "epoch": 2.04,
2094
- "learning_rate": 0.0,
2095
- "loss": 1.8029,
2096
- "step": 6160
2097
- },
2098
- {
2099
- "epoch": 2.05,
2100
- "learning_rate": 0.0,
2101
- "loss": 1.8283,
2102
- "step": 6180
2103
- },
2104
- {
2105
- "epoch": 2.06,
2106
- "learning_rate": 0.0,
2107
- "loss": 1.8167,
2108
- "step": 6200
2109
- },
2110
- {
2111
- "epoch": 2.06,
2112
- "eval_loss": 1.970503568649292,
2113
- "eval_runtime": 11950.9199,
2114
- "eval_samples_per_second": 1.119,
2115
- "eval_steps_per_second": 0.14,
2116
- "step": 6200
2117
- },
2118
- {
2119
- "epoch": 2.06,
2120
- "learning_rate": 0.0,
2121
- "loss": 1.8465,
2122
- "step": 6220
2123
- },
2124
- {
2125
- "epoch": 2.07,
2126
- "learning_rate": 0.0,
2127
- "loss": 1.8747,
2128
- "step": 6240
2129
- },
2130
- {
2131
- "epoch": 2.08,
2132
- "learning_rate": 0.0,
2133
- "loss": 1.8031,
2134
- "step": 6260
2135
- },
2136
- {
2137
- "epoch": 2.08,
2138
- "learning_rate": 0.0,
2139
- "loss": 1.8366,
2140
- "step": 6280
2141
- },
2142
- {
2143
- "epoch": 2.09,
2144
- "learning_rate": 0.0,
2145
- "loss": 1.7998,
2146
- "step": 6300
2147
- },
2148
- {
2149
- "epoch": 2.1,
2150
- "learning_rate": 0.0,
2151
- "loss": 1.8143,
2152
- "step": 6320
2153
- },
2154
- {
2155
- "epoch": 2.1,
2156
- "learning_rate": 0.0,
2157
- "loss": 1.8586,
2158
- "step": 6340
2159
- },
2160
- {
2161
- "epoch": 2.11,
2162
- "learning_rate": 0.0,
2163
- "loss": 1.836,
2164
- "step": 6360
2165
- },
2166
- {
2167
- "epoch": 2.12,
2168
- "learning_rate": 0.0,
2169
- "loss": 1.863,
2170
- "step": 6380
2171
- },
2172
- {
2173
- "epoch": 2.12,
2174
- "learning_rate": 0.0,
2175
- "loss": 1.7667,
2176
- "step": 6400
2177
- },
2178
- {
2179
- "epoch": 2.12,
2180
- "eval_loss": 1.970503568649292,
2181
- "eval_runtime": 11949.7143,
2182
- "eval_samples_per_second": 1.119,
2183
- "eval_steps_per_second": 0.14,
2184
- "step": 6400
2185
- },
2186
- {
2187
- "epoch": 2.13,
2188
- "learning_rate": 0.0,
2189
- "loss": 1.8187,
2190
- "step": 6420
2191
- },
2192
- {
2193
- "epoch": 2.14,
2194
- "learning_rate": 0.0,
2195
- "loss": 1.7883,
2196
- "step": 6440
2197
- },
2198
- {
2199
- "epoch": 2.14,
2200
- "learning_rate": 0.0,
2201
- "loss": 1.9299,
2202
- "step": 6460
2203
- },
2204
- {
2205
- "epoch": 2.15,
2206
- "learning_rate": 0.0,
2207
- "loss": 1.8286,
2208
- "step": 6480
2209
- },
2210
- {
2211
- "epoch": 2.16,
2212
- "learning_rate": 0.0,
2213
- "loss": 1.8181,
2214
- "step": 6500
2215
- },
2216
- {
2217
- "epoch": 2.16,
2218
- "learning_rate": 0.0,
2219
- "loss": 1.7856,
2220
- "step": 6520
2221
- },
2222
- {
2223
- "epoch": 2.17,
2224
- "learning_rate": 0.0,
2225
- "loss": 1.7192,
2226
- "step": 6540
2227
- },
2228
- {
2229
- "epoch": 2.18,
2230
- "learning_rate": 0.0,
2231
- "loss": 1.7715,
2232
- "step": 6560
2233
- },
2234
- {
2235
- "epoch": 2.18,
2236
- "learning_rate": 0.0,
2237
- "loss": 1.8359,
2238
- "step": 6580
2239
- },
2240
- {
2241
- "epoch": 2.19,
2242
- "learning_rate": 0.0,
2243
- "loss": 1.7989,
2244
- "step": 6600
2245
- },
2246
- {
2247
- "epoch": 2.19,
2248
- "eval_loss": 1.970503568649292,
2249
- "eval_runtime": 11949.8024,
2250
- "eval_samples_per_second": 1.119,
2251
- "eval_steps_per_second": 0.14,
2252
- "step": 6600
2253
- },
2254
- {
2255
- "epoch": 2.2,
2256
- "learning_rate": 0.0,
2257
- "loss": 1.7869,
2258
- "step": 6620
2259
- },
2260
- {
2261
- "epoch": 2.2,
2262
- "learning_rate": 0.0,
2263
- "loss": 1.7855,
2264
- "step": 6640
2265
- },
2266
- {
2267
- "epoch": 2.21,
2268
- "learning_rate": 0.0,
2269
- "loss": 1.8312,
2270
- "step": 6660
2271
- },
2272
- {
2273
- "epoch": 2.22,
2274
- "learning_rate": 0.0,
2275
- "loss": 1.7717,
2276
- "step": 6680
2277
- },
2278
- {
2279
- "epoch": 2.22,
2280
- "learning_rate": 0.0,
2281
- "loss": 1.7621,
2282
- "step": 6700
2283
- },
2284
- {
2285
- "epoch": 2.23,
2286
- "learning_rate": 0.0,
2287
- "loss": 1.8031,
2288
- "step": 6720
2289
- },
2290
- {
2291
- "epoch": 2.24,
2292
- "learning_rate": 0.0,
2293
- "loss": 1.8061,
2294
- "step": 6740
2295
- },
2296
- {
2297
- "epoch": 2.24,
2298
- "learning_rate": 0.0,
2299
- "loss": 1.8063,
2300
- "step": 6760
2301
- },
2302
- {
2303
- "epoch": 2.25,
2304
- "learning_rate": 0.0,
2305
- "loss": 1.7469,
2306
- "step": 6780
2307
- },
2308
- {
2309
- "epoch": 2.26,
2310
- "learning_rate": 0.0,
2311
- "loss": 1.7543,
2312
- "step": 6800
2313
- },
2314
- {
2315
- "epoch": 2.26,
2316
- "eval_loss": 1.970503568649292,
2317
- "eval_runtime": 11954.2042,
2318
- "eval_samples_per_second": 1.118,
2319
- "eval_steps_per_second": 0.14,
2320
- "step": 6800
2321
- },
2322
- {
2323
- "epoch": 2.26,
2324
- "learning_rate": 0.0,
2325
- "loss": 1.8659,
2326
- "step": 6820
2327
- },
2328
- {
2329
- "epoch": 2.27,
2330
- "learning_rate": 0.0,
2331
- "loss": 1.7935,
2332
- "step": 6840
2333
- },
2334
- {
2335
- "epoch": 2.28,
2336
- "learning_rate": 0.0,
2337
- "loss": 1.8247,
2338
- "step": 6860
2339
- },
2340
- {
2341
- "epoch": 2.28,
2342
- "learning_rate": 0.0,
2343
- "loss": 1.8339,
2344
- "step": 6880
2345
- },
2346
- {
2347
- "epoch": 2.29,
2348
- "learning_rate": 0.0,
2349
- "loss": 1.8419,
2350
- "step": 6900
2351
- },
2352
- {
2353
- "epoch": 2.3,
2354
- "learning_rate": 0.0,
2355
- "loss": 1.7978,
2356
- "step": 6920
2357
- },
2358
- {
2359
- "epoch": 2.3,
2360
- "learning_rate": 0.0,
2361
- "loss": 1.8048,
2362
- "step": 6940
2363
- },
2364
- {
2365
- "epoch": 2.31,
2366
- "learning_rate": 0.0,
2367
- "loss": 1.8523,
2368
- "step": 6960
2369
- },
2370
- {
2371
- "epoch": 2.32,
2372
- "learning_rate": 0.0,
2373
- "loss": 1.7875,
2374
- "step": 6980
2375
- },
2376
- {
2377
- "epoch": 2.32,
2378
- "learning_rate": 0.0,
2379
- "loss": 1.8262,
2380
- "step": 7000
2381
- },
2382
- {
2383
- "epoch": 2.32,
2384
- "eval_loss": 1.970503568649292,
2385
- "eval_runtime": 11967.8262,
2386
- "eval_samples_per_second": 1.117,
2387
- "eval_steps_per_second": 0.14,
2388
- "step": 7000
2389
- },
2390
- {
2391
- "epoch": 2.33,
2392
- "learning_rate": 0.0,
2393
- "loss": 1.8661,
2394
- "step": 7020
2395
- },
2396
- {
2397
- "epoch": 2.33,
2398
- "learning_rate": 0.0,
2399
- "loss": 1.8136,
2400
- "step": 7040
2401
- },
2402
- {
2403
- "epoch": 2.34,
2404
- "learning_rate": 0.0,
2405
- "loss": 1.8066,
2406
- "step": 7060
2407
- },
2408
- {
2409
- "epoch": 2.35,
2410
- "learning_rate": 0.0,
2411
- "loss": 1.8355,
2412
- "step": 7080
2413
- },
2414
- {
2415
- "epoch": 2.35,
2416
- "learning_rate": 0.0,
2417
- "loss": 1.7598,
2418
- "step": 7100
2419
- },
2420
- {
2421
- "epoch": 2.36,
2422
- "learning_rate": 0.0,
2423
- "loss": 1.8384,
2424
- "step": 7120
2425
- },
2426
- {
2427
- "epoch": 2.37,
2428
- "learning_rate": 0.0,
2429
- "loss": 1.7768,
2430
- "step": 7140
2431
- },
2432
- {
2433
- "epoch": 2.37,
2434
- "learning_rate": 0.0,
2435
- "loss": 1.8371,
2436
- "step": 7160
2437
- },
2438
- {
2439
- "epoch": 2.38,
2440
- "learning_rate": 0.0,
2441
- "loss": 1.7989,
2442
- "step": 7180
2443
- },
2444
- {
2445
- "epoch": 2.39,
2446
- "learning_rate": 0.0,
2447
- "loss": 1.8204,
2448
- "step": 7200
2449
- },
2450
- {
2451
- "epoch": 2.39,
2452
- "eval_loss": 1.970503568649292,
2453
- "eval_runtime": 11959.9088,
2454
- "eval_samples_per_second": 1.118,
2455
- "eval_steps_per_second": 0.14,
2456
- "step": 7200
2457
- },
2458
- {
2459
- "epoch": 2.39,
2460
- "learning_rate": 0.0,
2461
- "loss": 1.8632,
2462
- "step": 7220
2463
- },
2464
- {
2465
- "epoch": 2.4,
2466
- "learning_rate": 0.0,
2467
- "loss": 1.838,
2468
- "step": 7240
2469
- },
2470
- {
2471
- "epoch": 2.41,
2472
- "learning_rate": 0.0,
2473
- "loss": 1.8492,
2474
- "step": 7260
2475
- },
2476
- {
2477
- "epoch": 2.41,
2478
- "learning_rate": 0.0,
2479
- "loss": 1.8213,
2480
- "step": 7280
2481
- },
2482
- {
2483
- "epoch": 2.42,
2484
- "learning_rate": 0.0,
2485
- "loss": 1.7367,
2486
- "step": 7300
2487
- },
2488
- {
2489
- "epoch": 2.43,
2490
- "learning_rate": 0.0,
2491
- "loss": 1.9046,
2492
- "step": 7320
2493
- },
2494
- {
2495
- "epoch": 2.43,
2496
- "learning_rate": 0.0,
2497
- "loss": 1.7799,
2498
- "step": 7340
2499
- },
2500
- {
2501
- "epoch": 2.44,
2502
- "learning_rate": 0.0,
2503
- "loss": 1.793,
2504
- "step": 7360
2505
- },
2506
- {
2507
- "epoch": 2.45,
2508
- "learning_rate": 0.0,
2509
- "loss": 1.7864,
2510
- "step": 7380
2511
- },
2512
- {
2513
- "epoch": 2.45,
2514
- "learning_rate": 0.0,
2515
- "loss": 1.8071,
2516
- "step": 7400
2517
- },
2518
- {
2519
- "epoch": 2.45,
2520
- "eval_loss": 1.970503568649292,
2521
- "eval_runtime": 11949.9374,
2522
- "eval_samples_per_second": 1.119,
2523
- "eval_steps_per_second": 0.14,
2524
- "step": 7400
2525
- },
2526
- {
2527
- "epoch": 2.46,
2528
- "learning_rate": 0.0,
2529
- "loss": 1.7708,
2530
- "step": 7420
2531
- },
2532
- {
2533
- "epoch": 2.47,
2534
- "learning_rate": 0.0,
2535
- "loss": 1.8234,
2536
- "step": 7440
2537
- },
2538
- {
2539
- "epoch": 2.47,
2540
- "learning_rate": 0.0,
2541
- "loss": 1.8214,
2542
- "step": 7460
2543
- },
2544
- {
2545
- "epoch": 2.48,
2546
- "learning_rate": 0.0,
2547
- "loss": 1.8468,
2548
- "step": 7480
2549
- },
2550
- {
2551
- "epoch": 2.49,
2552
- "learning_rate": 0.0,
2553
- "loss": 1.784,
2554
- "step": 7500
2555
- },
2556
- {
2557
- "epoch": 2.49,
2558
- "learning_rate": 0.0,
2559
- "loss": 1.8571,
2560
- "step": 7520
2561
- },
2562
- {
2563
- "epoch": 2.5,
2564
- "learning_rate": 0.0,
2565
- "loss": 1.7871,
2566
- "step": 7540
2567
- },
2568
- {
2569
- "epoch": 2.51,
2570
- "learning_rate": 0.0,
2571
- "loss": 1.7819,
2572
- "step": 7560
2573
- },
2574
- {
2575
- "epoch": 2.51,
2576
- "learning_rate": 0.0,
2577
- "loss": 1.8035,
2578
- "step": 7580
2579
- },
2580
- {
2581
- "epoch": 2.52,
2582
- "learning_rate": 0.0,
2583
- "loss": 1.801,
2584
- "step": 7600
2585
- },
2586
- {
2587
- "epoch": 2.52,
2588
- "eval_loss": 1.970503568649292,
2589
- "eval_runtime": 11949.3183,
2590
- "eval_samples_per_second": 1.119,
2591
- "eval_steps_per_second": 0.14,
2592
- "step": 7600
2593
- },
2594
- {
2595
- "epoch": 2.53,
2596
- "learning_rate": 0.0,
2597
- "loss": 1.8302,
2598
- "step": 7620
2599
- },
2600
- {
2601
- "epoch": 2.53,
2602
- "learning_rate": 0.0,
2603
- "loss": 1.8208,
2604
- "step": 7640
2605
- },
2606
- {
2607
- "epoch": 2.54,
2608
- "learning_rate": 0.0,
2609
- "loss": 1.917,
2610
- "step": 7660
2611
- },
2612
- {
2613
- "epoch": 2.55,
2614
- "learning_rate": 0.0,
2615
- "loss": 1.8184,
2616
- "step": 7680
2617
- },
2618
- {
2619
- "epoch": 2.55,
2620
- "learning_rate": 0.0,
2621
- "loss": 1.7462,
2622
- "step": 7700
2623
- },
2624
- {
2625
- "epoch": 2.56,
2626
- "learning_rate": 0.0,
2627
- "loss": 1.7699,
2628
- "step": 7720
2629
- },
2630
- {
2631
- "epoch": 2.57,
2632
- "learning_rate": 0.0,
2633
- "loss": 1.8377,
2634
- "step": 7740
2635
- },
2636
- {
2637
- "epoch": 2.57,
2638
- "learning_rate": 0.0,
2639
- "loss": 1.7713,
2640
- "step": 7760
2641
- },
2642
- {
2643
- "epoch": 2.58,
2644
- "learning_rate": 0.0,
2645
- "loss": 1.798,
2646
- "step": 7780
2647
- },
2648
- {
2649
- "epoch": 2.59,
2650
- "learning_rate": 0.0,
2651
- "loss": 1.8425,
2652
- "step": 7800
2653
- },
2654
- {
2655
- "epoch": 2.59,
2656
- "eval_loss": 1.970503568649292,
2657
- "eval_runtime": 11947.7265,
2658
- "eval_samples_per_second": 1.119,
2659
- "eval_steps_per_second": 0.14,
2660
- "step": 7800
2661
- },
2662
- {
2663
- "epoch": 2.59,
2664
- "learning_rate": 0.0,
2665
- "loss": 1.7816,
2666
- "step": 7820
2667
- },
2668
- {
2669
- "epoch": 2.6,
2670
- "learning_rate": 0.0,
2671
- "loss": 1.8193,
2672
- "step": 7840
2673
- },
2674
- {
2675
- "epoch": 2.61,
2676
- "learning_rate": 0.0,
2677
- "loss": 1.8044,
2678
- "step": 7860
2679
- },
2680
- {
2681
- "epoch": 2.61,
2682
- "learning_rate": 0.0,
2683
- "loss": 1.7937,
2684
- "step": 7880
2685
- },
2686
- {
2687
- "epoch": 2.62,
2688
- "learning_rate": 0.0,
2689
- "loss": 1.8224,
2690
- "step": 7900
2691
- },
2692
- {
2693
- "epoch": 2.63,
2694
- "learning_rate": 0.0,
2695
- "loss": 1.7937,
2696
- "step": 7920
2697
- },
2698
- {
2699
- "epoch": 2.63,
2700
- "learning_rate": 0.0,
2701
- "loss": 1.7988,
2702
- "step": 7940
2703
- },
2704
- {
2705
- "epoch": 2.64,
2706
- "learning_rate": 0.0,
2707
- "loss": 1.8622,
2708
- "step": 7960
2709
- },
2710
- {
2711
- "epoch": 2.65,
2712
- "learning_rate": 0.0,
2713
- "loss": 1.8407,
2714
- "step": 7980
2715
- },
2716
- {
2717
- "epoch": 2.65,
2718
- "learning_rate": 0.0,
2719
- "loss": 1.7578,
2720
- "step": 8000
2721
- },
2722
- {
2723
- "epoch": 2.65,
2724
- "eval_loss": 1.970503568649292,
2725
- "eval_runtime": 11948.4274,
2726
- "eval_samples_per_second": 1.119,
2727
- "eval_steps_per_second": 0.14,
2728
- "step": 8000
2729
- },
2730
- {
2731
- "epoch": 2.66,
2732
- "learning_rate": 0.0,
2733
- "loss": 1.789,
2734
- "step": 8020
2735
- },
2736
- {
2737
- "epoch": 2.67,
2738
- "learning_rate": 0.0,
2739
- "loss": 1.7939,
2740
- "step": 8040
2741
- },
2742
- {
2743
- "epoch": 2.67,
2744
- "learning_rate": 0.0,
2745
- "loss": 1.7694,
2746
- "step": 8060
2747
- },
2748
- {
2749
- "epoch": 2.68,
2750
- "learning_rate": 0.0,
2751
- "loss": 1.7917,
2752
- "step": 8080
2753
- },
2754
- {
2755
- "epoch": 2.69,
2756
- "learning_rate": 0.0,
2757
- "loss": 1.7737,
2758
- "step": 8100
2759
- },
2760
- {
2761
- "epoch": 2.69,
2762
- "learning_rate": 0.0,
2763
- "loss": 1.8049,
2764
- "step": 8120
2765
- },
2766
- {
2767
- "epoch": 2.7,
2768
- "learning_rate": 0.0,
2769
- "loss": 1.8419,
2770
- "step": 8140
2771
- },
2772
- {
2773
- "epoch": 2.71,
2774
- "learning_rate": 0.0,
2775
- "loss": 1.7835,
2776
- "step": 8160
2777
- },
2778
- {
2779
- "epoch": 2.71,
2780
- "learning_rate": 0.0,
2781
- "loss": 1.7898,
2782
- "step": 8180
2783
- },
2784
- {
2785
- "epoch": 2.72,
2786
- "learning_rate": 0.0,
2787
- "loss": 1.7798,
2788
- "step": 8200
2789
- },
2790
- {
2791
- "epoch": 2.72,
2792
- "eval_loss": 1.970503568649292,
2793
- "eval_runtime": 11959.3563,
2794
- "eval_samples_per_second": 1.118,
2795
- "eval_steps_per_second": 0.14,
2796
- "step": 8200
2797
- },
2798
- {
2799
- "epoch": 2.73,
2800
- "learning_rate": 0.0,
2801
- "loss": 1.7904,
2802
- "step": 8220
2803
- },
2804
- {
2805
- "epoch": 2.73,
2806
- "learning_rate": 0.0,
2807
- "loss": 1.8054,
2808
- "step": 8240
2809
- },
2810
- {
2811
- "epoch": 2.74,
2812
- "learning_rate": 0.0,
2813
- "loss": 1.8316,
2814
- "step": 8260
2815
- },
2816
- {
2817
- "epoch": 2.75,
2818
- "learning_rate": 0.0,
2819
- "loss": 1.8059,
2820
- "step": 8280
2821
- },
2822
- {
2823
- "epoch": 2.75,
2824
- "learning_rate": 0.0,
2825
- "loss": 1.7908,
2826
- "step": 8300
2827
- },
2828
- {
2829
- "epoch": 2.76,
2830
- "learning_rate": 0.0,
2831
- "loss": 1.8092,
2832
- "step": 8320
2833
- },
2834
- {
2835
- "epoch": 2.77,
2836
- "learning_rate": 0.0,
2837
- "loss": 1.8954,
2838
- "step": 8340
2839
- },
2840
- {
2841
- "epoch": 2.77,
2842
- "learning_rate": 0.0,
2843
- "loss": 1.835,
2844
- "step": 8360
2845
- },
2846
- {
2847
- "epoch": 2.78,
2848
- "learning_rate": 0.0,
2849
- "loss": 1.8018,
2850
- "step": 8380
2851
- },
2852
- {
2853
- "epoch": 2.79,
2854
- "learning_rate": 0.0,
2855
- "loss": 1.8474,
2856
- "step": 8400
2857
- },
2858
- {
2859
- "epoch": 2.79,
2860
- "eval_loss": 1.970503568649292,
2861
- "eval_runtime": 11978.111,
2862
- "eval_samples_per_second": 1.116,
2863
- "eval_steps_per_second": 0.14,
2864
- "step": 8400
2865
- },
2866
- {
2867
- "epoch": 2.79,
2868
- "learning_rate": 0.0,
2869
- "loss": 1.7878,
2870
- "step": 8420
2871
- },
2872
- {
2873
- "epoch": 2.8,
2874
- "learning_rate": 0.0,
2875
- "loss": 1.7629,
2876
- "step": 8440
2877
- },
2878
- {
2879
- "epoch": 2.81,
2880
- "learning_rate": 0.0,
2881
- "loss": 1.8068,
2882
- "step": 8460
2883
- },
2884
- {
2885
- "epoch": 2.81,
2886
- "learning_rate": 0.0,
2887
- "loss": 1.7907,
2888
- "step": 8480
2889
- },
2890
- {
2891
- "epoch": 2.82,
2892
- "learning_rate": 0.0,
2893
- "loss": 1.7598,
2894
- "step": 8500
2895
- },
2896
- {
2897
- "epoch": 2.83,
2898
- "learning_rate": 0.0,
2899
- "loss": 1.7964,
2900
- "step": 8520
2901
- },
2902
- {
2903
- "epoch": 2.83,
2904
- "learning_rate": 0.0,
2905
- "loss": 1.7951,
2906
- "step": 8540
2907
- },
2908
- {
2909
- "epoch": 2.84,
2910
- "learning_rate": 0.0,
2911
- "loss": 1.7962,
2912
- "step": 8560
2913
- },
2914
- {
2915
- "epoch": 2.85,
2916
- "learning_rate": 0.0,
2917
- "loss": 1.7908,
2918
- "step": 8580
2919
- },
2920
- {
2921
- "epoch": 2.85,
2922
- "learning_rate": 0.0,
2923
- "loss": 1.7525,
2924
- "step": 8600
2925
- },
2926
- {
2927
- "epoch": 2.85,
2928
- "eval_loss": 1.970503568649292,
2929
- "eval_runtime": 11977.8941,
2930
- "eval_samples_per_second": 1.116,
2931
- "eval_steps_per_second": 0.14,
2932
- "step": 8600
2933
- },
2934
- {
2935
- "epoch": 2.86,
2936
- "learning_rate": 0.0,
2937
- "loss": 1.7268,
2938
- "step": 8620
2939
- },
2940
- {
2941
- "epoch": 2.87,
2942
- "learning_rate": 0.0,
2943
- "loss": 1.7716,
2944
- "step": 8640
2945
- },
2946
- {
2947
- "epoch": 2.87,
2948
- "learning_rate": 0.0,
2949
- "loss": 1.8214,
2950
- "step": 8660
2951
- },
2952
- {
2953
- "epoch": 2.88,
2954
- "learning_rate": 0.0,
2955
- "loss": 1.8116,
2956
- "step": 8680
2957
- },
2958
- {
2959
- "epoch": 2.89,
2960
- "learning_rate": 0.0,
2961
- "loss": 1.8204,
2962
- "step": 8700
2963
- },
2964
- {
2965
- "epoch": 2.89,
2966
- "learning_rate": 0.0,
2967
- "loss": 1.7878,
2968
- "step": 8720
2969
- },
2970
- {
2971
- "epoch": 2.9,
2972
- "learning_rate": 0.0,
2973
- "loss": 1.8828,
2974
- "step": 8740
2975
- },
2976
- {
2977
- "epoch": 2.91,
2978
- "learning_rate": 0.0,
2979
- "loss": 1.8015,
2980
- "step": 8760
2981
- },
2982
- {
2983
- "epoch": 2.91,
2984
- "learning_rate": 0.0,
2985
- "loss": 1.7989,
2986
- "step": 8780
2987
- },
2988
- {
2989
- "epoch": 2.92,
2990
- "learning_rate": 0.0,
2991
- "loss": 1.7467,
2992
- "step": 8800
2993
- },
2994
- {
2995
- "epoch": 2.92,
2996
- "eval_loss": 1.970503568649292,
2997
- "eval_runtime": 11960.8065,
2998
- "eval_samples_per_second": 1.118,
2999
- "eval_steps_per_second": 0.14,
3000
- "step": 8800
3001
- }
3002
- ],
3003
- "max_steps": 9045,
3004
- "num_train_epochs": 3,
3005
- "total_flos": 5.4879340491679334e+17,
3006
- "trial_name": null,
3007
- "trial_params": null
3008
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-8800/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5a78612ea8930d68eba4cb53d62254ccf547582e754aa049d169c3c11dd5fe4
3
- size 4027
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/adapter_config.json DELETED
@@ -1,16 +0,0 @@
1
- {
2
- "base_model_name_or_path": "/domino/edv/afs-mrmc-data-store-rw/innovation/hf/RedPajama-INCITE-7B-Base",
3
- "bias": "none",
4
- "fan_in_fan_out": false,
5
- "inference_mode": true,
6
- "init_lora_weights": true,
7
- "lora_alpha": 16,
8
- "lora_dropout": 0.05,
9
- "modules_to_save": null,
10
- "peft_type": "LORA",
11
- "r": 8,
12
- "target_modules": [
13
- "query_key_value"
14
- ],
15
- "task_type": "CAUSAL_LM"
16
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:36c95157caaa0e8a49516175a38f9e3dfad6634df6c2d7fc47b2e3298cf4d68e
3
- size 16800753
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3c62c46fbec701f73b9c0d2af49122155f2c1654a0d513a7842e367f0516218
3
- size 33592261
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d6e839a14f185d11ef5c6540bd37eb7bad7288947ab9f31a50d914db4272e23
3
- size 14575
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:471be078058bbf990ff963153250176cf0c4eef4d5abd363e271020185c5758c
3
- size 627
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/trainer_state.json DELETED
@@ -1,3076 +0,0 @@
1
- {
2
- "best_metric": 1.9635850191116333,
3
- "best_model_checkpoint": "./results/redpj7B-lora-cnn-dailymail-results_fine_tune_test/checkpoint-600",
4
- "epoch": 2.9850746268656714,
5
- "global_step": 9000,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.01,
12
- "learning_rate": 0.00027082228116710874,
13
- "loss": 1.8654,
14
- "step": 20
15
- },
16
- {
17
- "epoch": 0.01,
18
- "learning_rate": 0.0002681697612732095,
19
- "loss": 1.8124,
20
- "step": 40
21
- },
22
- {
23
- "epoch": 0.02,
24
- "learning_rate": 0.0002655172413793103,
25
- "loss": 1.8231,
26
- "step": 60
27
- },
28
- {
29
- "epoch": 0.03,
30
- "learning_rate": 0.0002628647214854111,
31
- "loss": 1.7818,
32
- "step": 80
33
- },
34
- {
35
- "epoch": 0.03,
36
- "learning_rate": 0.00026021220159151194,
37
- "loss": 1.8446,
38
- "step": 100
39
- },
40
- {
41
- "epoch": 0.04,
42
- "learning_rate": 0.0002575596816976127,
43
- "loss": 1.9001,
44
- "step": 120
45
- },
46
- {
47
- "epoch": 0.05,
48
- "learning_rate": 0.0002549071618037135,
49
- "loss": 1.8073,
50
- "step": 140
51
- },
52
- {
53
- "epoch": 0.05,
54
- "learning_rate": 0.0002522546419098143,
55
- "loss": 1.8506,
56
- "step": 160
57
- },
58
- {
59
- "epoch": 0.06,
60
- "learning_rate": 0.0002496021220159151,
61
- "loss": 1.8471,
62
- "step": 180
63
- },
64
- {
65
- "epoch": 0.07,
66
- "learning_rate": 0.0002469496021220159,
67
- "loss": 1.8647,
68
- "step": 200
69
- },
70
- {
71
- "epoch": 0.07,
72
- "eval_loss": 1.966022253036499,
73
- "eval_runtime": 12002.3952,
74
- "eval_samples_per_second": 1.114,
75
- "eval_steps_per_second": 0.139,
76
- "step": 200
77
- },
78
- {
79
- "epoch": 0.07,
80
- "learning_rate": 0.00024429708222811666,
81
- "loss": 1.8578,
82
- "step": 220
83
- },
84
- {
85
- "epoch": 0.08,
86
- "learning_rate": 0.0002416445623342175,
87
- "loss": 1.8329,
88
- "step": 240
89
- },
90
- {
91
- "epoch": 0.09,
92
- "learning_rate": 0.0002389920424403183,
93
- "loss": 1.8119,
94
- "step": 260
95
- },
96
- {
97
- "epoch": 0.09,
98
- "learning_rate": 0.0002363395225464191,
99
- "loss": 1.8884,
100
- "step": 280
101
- },
102
- {
103
- "epoch": 0.1,
104
- "learning_rate": 0.00023368700265251986,
105
- "loss": 1.9077,
106
- "step": 300
107
- },
108
- {
109
- "epoch": 0.11,
110
- "learning_rate": 0.00023103448275862065,
111
- "loss": 1.8092,
112
- "step": 320
113
- },
114
- {
115
- "epoch": 0.11,
116
- "learning_rate": 0.00022838196286472146,
117
- "loss": 1.9237,
118
- "step": 340
119
- },
120
- {
121
- "epoch": 0.12,
122
- "learning_rate": 0.00022572944297082225,
123
- "loss": 1.8701,
124
- "step": 360
125
- },
126
- {
127
- "epoch": 0.13,
128
- "learning_rate": 0.00022307692307692306,
129
- "loss": 1.8933,
130
- "step": 380
131
- },
132
- {
133
- "epoch": 0.13,
134
- "learning_rate": 0.00022042440318302385,
135
- "loss": 1.8453,
136
- "step": 400
137
- },
138
- {
139
- "epoch": 0.13,
140
- "eval_loss": 1.9667036533355713,
141
- "eval_runtime": 11991.8036,
142
- "eval_samples_per_second": 1.115,
143
- "eval_steps_per_second": 0.139,
144
- "step": 400
145
- },
146
- {
147
- "epoch": 0.14,
148
- "learning_rate": 0.00021777188328912466,
149
- "loss": 1.8272,
150
- "step": 420
151
- },
152
- {
153
- "epoch": 0.15,
154
- "learning_rate": 0.00021511936339522545,
155
- "loss": 1.8149,
156
- "step": 440
157
- },
158
- {
159
- "epoch": 0.15,
160
- "learning_rate": 0.00021246684350132626,
161
- "loss": 1.8565,
162
- "step": 460
163
- },
164
- {
165
- "epoch": 0.16,
166
- "learning_rate": 0.00020981432360742705,
167
- "loss": 1.8139,
168
- "step": 480
169
- },
170
- {
171
- "epoch": 0.17,
172
- "learning_rate": 0.0002071618037135278,
173
- "loss": 1.802,
174
- "step": 500
175
- },
176
- {
177
- "epoch": 0.17,
178
- "learning_rate": 0.00020450928381962862,
179
- "loss": 1.8795,
180
- "step": 520
181
- },
182
- {
183
- "epoch": 0.18,
184
- "learning_rate": 0.0002018567639257294,
185
- "loss": 1.8621,
186
- "step": 540
187
- },
188
- {
189
- "epoch": 0.19,
190
- "learning_rate": 0.00019920424403183022,
191
- "loss": 1.8363,
192
- "step": 560
193
- },
194
- {
195
- "epoch": 0.19,
196
- "learning_rate": 0.000196551724137931,
197
- "loss": 1.8746,
198
- "step": 580
199
- },
200
- {
201
- "epoch": 0.2,
202
- "learning_rate": 0.00019389920424403182,
203
- "loss": 1.7763,
204
- "step": 600
205
- },
206
- {
207
- "epoch": 0.2,
208
- "eval_loss": 1.9635850191116333,
209
- "eval_runtime": 11976.5234,
210
- "eval_samples_per_second": 1.116,
211
- "eval_steps_per_second": 0.14,
212
- "step": 600
213
- },
214
- {
215
- "epoch": 0.21,
216
- "learning_rate": 0.0001912466843501326,
217
- "loss": 1.8423,
218
- "step": 620
219
- },
220
- {
221
- "epoch": 0.21,
222
- "learning_rate": 0.00018859416445623343,
223
- "loss": 1.8998,
224
- "step": 640
225
- },
226
- {
227
- "epoch": 0.22,
228
- "learning_rate": 0.0001859416445623342,
229
- "loss": 1.8131,
230
- "step": 660
231
- },
232
- {
233
- "epoch": 0.23,
234
- "learning_rate": 0.00018328912466843497,
235
- "loss": 1.8656,
236
- "step": 680
237
- },
238
- {
239
- "epoch": 0.23,
240
- "learning_rate": 0.00018063660477453579,
241
- "loss": 1.8181,
242
- "step": 700
243
- },
244
- {
245
- "epoch": 0.24,
246
- "learning_rate": 0.00017798408488063657,
247
- "loss": 1.8425,
248
- "step": 720
249
- },
250
- {
251
- "epoch": 0.25,
252
- "learning_rate": 0.00017533156498673739,
253
- "loss": 1.7883,
254
- "step": 740
255
- },
256
- {
257
- "epoch": 0.25,
258
- "learning_rate": 0.00017267904509283817,
259
- "loss": 1.8469,
260
- "step": 760
261
- },
262
- {
263
- "epoch": 0.26,
264
- "learning_rate": 0.000170026525198939,
265
- "loss": 1.8447,
266
- "step": 780
267
- },
268
- {
269
- "epoch": 0.27,
270
- "learning_rate": 0.00016737400530503977,
271
- "loss": 1.7872,
272
- "step": 800
273
- },
274
- {
275
- "epoch": 0.27,
276
- "eval_loss": 1.9704641103744507,
277
- "eval_runtime": 11963.7945,
278
- "eval_samples_per_second": 1.117,
279
- "eval_steps_per_second": 0.14,
280
- "step": 800
281
- },
282
- {
283
- "epoch": 0.27,
284
- "learning_rate": 0.0001647214854111406,
285
- "loss": 1.8032,
286
- "step": 820
287
- },
288
- {
289
- "epoch": 0.28,
290
- "learning_rate": 0.00016206896551724137,
291
- "loss": 1.8709,
292
- "step": 840
293
- },
294
- {
295
- "epoch": 0.29,
296
- "learning_rate": 0.00015941644562334216,
297
- "loss": 1.8801,
298
- "step": 860
299
- },
300
- {
301
- "epoch": 0.29,
302
- "learning_rate": 0.00015676392572944298,
303
- "loss": 1.8535,
304
- "step": 880
305
- },
306
- {
307
- "epoch": 0.3,
308
- "learning_rate": 0.00015411140583554373,
309
- "loss": 1.8631,
310
- "step": 900
311
- },
312
- {
313
- "epoch": 0.31,
314
- "learning_rate": 0.00015145888594164455,
315
- "loss": 1.752,
316
- "step": 920
317
- },
318
- {
319
- "epoch": 0.31,
320
- "learning_rate": 0.00014880636604774534,
321
- "loss": 1.8705,
322
- "step": 940
323
- },
324
- {
325
- "epoch": 0.32,
326
- "learning_rate": 0.00014615384615384615,
327
- "loss": 1.8301,
328
- "step": 960
329
- },
330
- {
331
- "epoch": 0.33,
332
- "learning_rate": 0.00014350132625994694,
333
- "loss": 1.7892,
334
- "step": 980
335
- },
336
- {
337
- "epoch": 0.33,
338
- "learning_rate": 0.00014084880636604772,
339
- "loss": 1.8671,
340
- "step": 1000
341
- },
342
- {
343
- "epoch": 0.33,
344
- "eval_loss": 1.9677250385284424,
345
- "eval_runtime": 11951.074,
346
- "eval_samples_per_second": 1.119,
347
- "eval_steps_per_second": 0.14,
348
- "step": 1000
349
- },
350
- {
351
- "epoch": 0.34,
352
- "learning_rate": 0.00013819628647214854,
353
- "loss": 1.8168,
354
- "step": 1020
355
- },
356
- {
357
- "epoch": 0.34,
358
- "learning_rate": 0.00013554376657824932,
359
- "loss": 1.8268,
360
- "step": 1040
361
- },
362
- {
363
- "epoch": 0.35,
364
- "learning_rate": 0.00013289124668435014,
365
- "loss": 1.8073,
366
- "step": 1060
367
- },
368
- {
369
- "epoch": 0.36,
370
- "learning_rate": 0.0001302387267904509,
371
- "loss": 1.9313,
372
- "step": 1080
373
- },
374
- {
375
- "epoch": 0.36,
376
- "learning_rate": 0.0001275862068965517,
377
- "loss": 1.9097,
378
- "step": 1100
379
- },
380
- {
381
- "epoch": 0.37,
382
- "learning_rate": 0.0001249336870026525,
383
- "loss": 1.9032,
384
- "step": 1120
385
- },
386
- {
387
- "epoch": 0.38,
388
- "learning_rate": 0.0001222811671087533,
389
- "loss": 1.8207,
390
- "step": 1140
391
- },
392
- {
393
- "epoch": 0.38,
394
- "learning_rate": 0.0001196286472148541,
395
- "loss": 1.8815,
396
- "step": 1160
397
- },
398
- {
399
- "epoch": 0.39,
400
- "learning_rate": 0.0001169761273209549,
401
- "loss": 1.8297,
402
- "step": 1180
403
- },
404
- {
405
- "epoch": 0.4,
406
- "learning_rate": 0.0001143236074270557,
407
- "loss": 1.8899,
408
- "step": 1200
409
- },
410
- {
411
- "epoch": 0.4,
412
- "eval_loss": 1.9722812175750732,
413
- "eval_runtime": 11951.1639,
414
- "eval_samples_per_second": 1.119,
415
- "eval_steps_per_second": 0.14,
416
- "step": 1200
417
- },
418
- {
419
- "epoch": 0.4,
420
- "learning_rate": 0.0001116710875331565,
421
- "loss": 1.8803,
422
- "step": 1220
423
- },
424
- {
425
- "epoch": 0.41,
426
- "learning_rate": 0.00010901856763925729,
427
- "loss": 1.8548,
428
- "step": 1240
429
- },
430
- {
431
- "epoch": 0.42,
432
- "learning_rate": 0.00010636604774535807,
433
- "loss": 1.8498,
434
- "step": 1260
435
- },
436
- {
437
- "epoch": 0.42,
438
- "learning_rate": 0.00010371352785145887,
439
- "loss": 1.8057,
440
- "step": 1280
441
- },
442
- {
443
- "epoch": 0.43,
444
- "learning_rate": 0.00010106100795755967,
445
- "loss": 1.8572,
446
- "step": 1300
447
- },
448
- {
449
- "epoch": 0.44,
450
- "learning_rate": 9.840848806366047e-05,
451
- "loss": 1.8469,
452
- "step": 1320
453
- },
454
- {
455
- "epoch": 0.44,
456
- "learning_rate": 9.575596816976126e-05,
457
- "loss": 1.8405,
458
- "step": 1340
459
- },
460
- {
461
- "epoch": 0.45,
462
- "learning_rate": 9.310344827586206e-05,
463
- "loss": 1.8193,
464
- "step": 1360
465
- },
466
- {
467
- "epoch": 0.46,
468
- "learning_rate": 9.045092838196286e-05,
469
- "loss": 1.7731,
470
- "step": 1380
471
- },
472
- {
473
- "epoch": 0.46,
474
- "learning_rate": 8.779840848806366e-05,
475
- "loss": 1.8329,
476
- "step": 1400
477
- },
478
- {
479
- "epoch": 0.46,
480
- "eval_loss": 1.9736511707305908,
481
- "eval_runtime": 11956.5916,
482
- "eval_samples_per_second": 1.118,
483
- "eval_steps_per_second": 0.14,
484
- "step": 1400
485
- },
486
- {
487
- "epoch": 0.47,
488
- "learning_rate": 8.514588859416444e-05,
489
- "loss": 1.8872,
490
- "step": 1420
491
- },
492
- {
493
- "epoch": 0.48,
494
- "learning_rate": 8.249336870026524e-05,
495
- "loss": 1.8511,
496
- "step": 1440
497
- },
498
- {
499
- "epoch": 0.48,
500
- "learning_rate": 7.984084880636604e-05,
501
- "loss": 1.7523,
502
- "step": 1460
503
- },
504
- {
505
- "epoch": 0.49,
506
- "learning_rate": 7.718832891246684e-05,
507
- "loss": 1.8619,
508
- "step": 1480
509
- },
510
- {
511
- "epoch": 0.5,
512
- "learning_rate": 7.453580901856764e-05,
513
- "loss": 1.8363,
514
- "step": 1500
515
- },
516
- {
517
- "epoch": 0.5,
518
- "learning_rate": 7.188328912466844e-05,
519
- "loss": 1.7745,
520
- "step": 1520
521
- },
522
- {
523
- "epoch": 0.51,
524
- "learning_rate": 6.923076923076922e-05,
525
- "loss": 1.8164,
526
- "step": 1540
527
- },
528
- {
529
- "epoch": 0.52,
530
- "learning_rate": 6.657824933687002e-05,
531
- "loss": 1.7997,
532
- "step": 1560
533
- },
534
- {
535
- "epoch": 0.52,
536
- "learning_rate": 6.392572944297081e-05,
537
- "loss": 1.8377,
538
- "step": 1580
539
- },
540
- {
541
- "epoch": 0.53,
542
- "learning_rate": 6.127320954907161e-05,
543
- "loss": 1.851,
544
- "step": 1600
545
- },
546
- {
547
- "epoch": 0.53,
548
- "eval_loss": 1.9729183912277222,
549
- "eval_runtime": 11970.5767,
550
- "eval_samples_per_second": 1.117,
551
- "eval_steps_per_second": 0.14,
552
- "step": 1600
553
- },
554
- {
555
- "epoch": 0.54,
556
- "learning_rate": 5.862068965517241e-05,
557
- "loss": 1.8382,
558
- "step": 1620
559
- },
560
- {
561
- "epoch": 0.54,
562
- "learning_rate": 5.59681697612732e-05,
563
- "loss": 1.8863,
564
- "step": 1640
565
- },
566
- {
567
- "epoch": 0.55,
568
- "learning_rate": 5.3315649867374e-05,
569
- "loss": 1.762,
570
- "step": 1660
571
- },
572
- {
573
- "epoch": 0.56,
574
- "learning_rate": 5.06631299734748e-05,
575
- "loss": 1.822,
576
- "step": 1680
577
- },
578
- {
579
- "epoch": 0.56,
580
- "learning_rate": 4.801061007957559e-05,
581
- "loss": 1.7776,
582
- "step": 1700
583
- },
584
- {
585
- "epoch": 0.57,
586
- "learning_rate": 4.5358090185676386e-05,
587
- "loss": 1.809,
588
- "step": 1720
589
- },
590
- {
591
- "epoch": 0.58,
592
- "learning_rate": 4.2705570291777186e-05,
593
- "loss": 1.912,
594
- "step": 1740
595
- },
596
- {
597
- "epoch": 0.58,
598
- "learning_rate": 4.005305039787798e-05,
599
- "loss": 1.7557,
600
- "step": 1760
601
- },
602
- {
603
- "epoch": 0.59,
604
- "learning_rate": 3.7400530503978774e-05,
605
- "loss": 1.8232,
606
- "step": 1780
607
- },
608
- {
609
- "epoch": 0.6,
610
- "learning_rate": 3.4748010610079574e-05,
611
- "loss": 1.8613,
612
- "step": 1800
613
- },
614
- {
615
- "epoch": 0.6,
616
- "eval_loss": 1.9704504013061523,
617
- "eval_runtime": 11980.5334,
618
- "eval_samples_per_second": 1.116,
619
- "eval_steps_per_second": 0.139,
620
- "step": 1800
621
- },
622
- {
623
- "epoch": 0.6,
624
- "learning_rate": 3.209549071618037e-05,
625
- "loss": 1.8378,
626
- "step": 1820
627
- },
628
- {
629
- "epoch": 0.61,
630
- "learning_rate": 2.9442970822281164e-05,
631
- "loss": 1.8871,
632
- "step": 1840
633
- },
634
- {
635
- "epoch": 0.62,
636
- "learning_rate": 2.679045092838196e-05,
637
- "loss": 1.8821,
638
- "step": 1860
639
- },
640
- {
641
- "epoch": 0.62,
642
- "learning_rate": 2.4137931034482755e-05,
643
- "loss": 1.8487,
644
- "step": 1880
645
- },
646
- {
647
- "epoch": 0.63,
648
- "learning_rate": 2.1485411140583555e-05,
649
- "loss": 1.8439,
650
- "step": 1900
651
- },
652
- {
653
- "epoch": 0.64,
654
- "learning_rate": 1.883289124668435e-05,
655
- "loss": 1.8469,
656
- "step": 1920
657
- },
658
- {
659
- "epoch": 0.64,
660
- "learning_rate": 1.6180371352785142e-05,
661
- "loss": 1.7797,
662
- "step": 1940
663
- },
664
- {
665
- "epoch": 0.65,
666
- "learning_rate": 1.352785145888594e-05,
667
- "loss": 1.7795,
668
- "step": 1960
669
- },
670
- {
671
- "epoch": 0.66,
672
- "learning_rate": 1.0875331564986736e-05,
673
- "loss": 1.8483,
674
- "step": 1980
675
- },
676
- {
677
- "epoch": 0.66,
678
- "learning_rate": 8.222811671087533e-06,
679
- "loss": 1.8321,
680
- "step": 2000
681
- },
682
- {
683
- "epoch": 0.66,
684
- "eval_loss": 1.9704707860946655,
685
- "eval_runtime": 11964.4776,
686
- "eval_samples_per_second": 1.117,
687
- "eval_steps_per_second": 0.14,
688
- "step": 2000
689
- },
690
- {
691
- "epoch": 0.67,
692
- "learning_rate": 5.5702917771883284e-06,
693
- "loss": 1.7708,
694
- "step": 2020
695
- },
696
- {
697
- "epoch": 0.68,
698
- "learning_rate": 2.9177718832891245e-06,
699
- "loss": 1.8731,
700
- "step": 2040
701
- },
702
- {
703
- "epoch": 0.68,
704
- "learning_rate": 2.652519893899204e-07,
705
- "loss": 1.8181,
706
- "step": 2060
707
- },
708
- {
709
- "epoch": 0.69,
710
- "learning_rate": 0.0,
711
- "loss": 1.8692,
712
- "step": 2080
713
- },
714
- {
715
- "epoch": 0.7,
716
- "learning_rate": 0.0,
717
- "loss": 1.8602,
718
- "step": 2100
719
- },
720
- {
721
- "epoch": 0.7,
722
- "learning_rate": 0.0,
723
- "loss": 1.8891,
724
- "step": 2120
725
- },
726
- {
727
- "epoch": 0.71,
728
- "learning_rate": 0.0,
729
- "loss": 1.8205,
730
- "step": 2140
731
- },
732
- {
733
- "epoch": 0.72,
734
- "learning_rate": 0.0,
735
- "loss": 1.8242,
736
- "step": 2160
737
- },
738
- {
739
- "epoch": 0.72,
740
- "learning_rate": 0.0,
741
- "loss": 1.8661,
742
- "step": 2180
743
- },
744
- {
745
- "epoch": 0.73,
746
- "learning_rate": 0.0,
747
- "loss": 1.7982,
748
- "step": 2200
749
- },
750
- {
751
- "epoch": 0.73,
752
- "eval_loss": 1.970503568649292,
753
- "eval_runtime": 11950.1314,
754
- "eval_samples_per_second": 1.119,
755
- "eval_steps_per_second": 0.14,
756
- "step": 2200
757
- },
758
- {
759
- "epoch": 0.74,
760
- "learning_rate": 0.0,
761
- "loss": 1.7967,
762
- "step": 2220
763
- },
764
- {
765
- "epoch": 0.74,
766
- "learning_rate": 0.0,
767
- "loss": 1.8346,
768
- "step": 2240
769
- },
770
- {
771
- "epoch": 0.75,
772
- "learning_rate": 0.0,
773
- "loss": 1.8242,
774
- "step": 2260
775
- },
776
- {
777
- "epoch": 0.76,
778
- "learning_rate": 0.0,
779
- "loss": 1.8429,
780
- "step": 2280
781
- },
782
- {
783
- "epoch": 0.76,
784
- "learning_rate": 0.0,
785
- "loss": 1.8795,
786
- "step": 2300
787
- },
788
- {
789
- "epoch": 0.77,
790
- "learning_rate": 0.0,
791
- "loss": 1.8967,
792
- "step": 2320
793
- },
794
- {
795
- "epoch": 0.78,
796
- "learning_rate": 0.0,
797
- "loss": 1.815,
798
- "step": 2340
799
- },
800
- {
801
- "epoch": 0.78,
802
- "learning_rate": 0.0,
803
- "loss": 1.8165,
804
- "step": 2360
805
- },
806
- {
807
- "epoch": 0.79,
808
- "learning_rate": 0.0,
809
- "loss": 1.838,
810
- "step": 2380
811
- },
812
- {
813
- "epoch": 0.8,
814
- "learning_rate": 0.0,
815
- "loss": 1.7798,
816
- "step": 2400
817
- },
818
- {
819
- "epoch": 0.8,
820
- "eval_loss": 1.970503568649292,
821
- "eval_runtime": 11951.0888,
822
- "eval_samples_per_second": 1.119,
823
- "eval_steps_per_second": 0.14,
824
- "step": 2400
825
- },
826
- {
827
- "epoch": 0.8,
828
- "learning_rate": 0.0,
829
- "loss": 1.7999,
830
- "step": 2420
831
- },
832
- {
833
- "epoch": 0.81,
834
- "learning_rate": 0.0,
835
- "loss": 1.7957,
836
- "step": 2440
837
- },
838
- {
839
- "epoch": 0.82,
840
- "learning_rate": 0.0,
841
- "loss": 1.8306,
842
- "step": 2460
843
- },
844
- {
845
- "epoch": 0.82,
846
- "learning_rate": 0.0,
847
- "loss": 1.87,
848
- "step": 2480
849
- },
850
- {
851
- "epoch": 0.83,
852
- "learning_rate": 0.0,
853
- "loss": 1.8046,
854
- "step": 2500
855
- },
856
- {
857
- "epoch": 0.84,
858
- "learning_rate": 0.0,
859
- "loss": 1.76,
860
- "step": 2520
861
- },
862
- {
863
- "epoch": 0.84,
864
- "learning_rate": 0.0,
865
- "loss": 1.869,
866
- "step": 2540
867
- },
868
- {
869
- "epoch": 0.85,
870
- "learning_rate": 0.0,
871
- "loss": 1.8684,
872
- "step": 2560
873
- },
874
- {
875
- "epoch": 0.86,
876
- "learning_rate": 0.0,
877
- "loss": 1.8757,
878
- "step": 2580
879
- },
880
- {
881
- "epoch": 0.86,
882
- "learning_rate": 0.0,
883
- "loss": 1.868,
884
- "step": 2600
885
- },
886
- {
887
- "epoch": 0.86,
888
- "eval_loss": 1.970503568649292,
889
- "eval_runtime": 11952.488,
890
- "eval_samples_per_second": 1.118,
891
- "eval_steps_per_second": 0.14,
892
- "step": 2600
893
- },
894
- {
895
- "epoch": 0.87,
896
- "learning_rate": 0.0,
897
- "loss": 1.8228,
898
- "step": 2620
899
- },
900
- {
901
- "epoch": 0.88,
902
- "learning_rate": 0.0,
903
- "loss": 1.8721,
904
- "step": 2640
905
- },
906
- {
907
- "epoch": 0.88,
908
- "learning_rate": 0.0,
909
- "loss": 1.8557,
910
- "step": 2660
911
- },
912
- {
913
- "epoch": 0.89,
914
- "learning_rate": 0.0,
915
- "loss": 1.8545,
916
- "step": 2680
917
- },
918
- {
919
- "epoch": 0.9,
920
- "learning_rate": 0.0,
921
- "loss": 1.8366,
922
- "step": 2700
923
- },
924
- {
925
- "epoch": 0.9,
926
- "learning_rate": 0.0,
927
- "loss": 1.8381,
928
- "step": 2720
929
- },
930
- {
931
- "epoch": 0.91,
932
- "learning_rate": 0.0,
933
- "loss": 1.8305,
934
- "step": 2740
935
- },
936
- {
937
- "epoch": 0.92,
938
- "learning_rate": 0.0,
939
- "loss": 1.8162,
940
- "step": 2760
941
- },
942
- {
943
- "epoch": 0.92,
944
- "learning_rate": 0.0,
945
- "loss": 1.8109,
946
- "step": 2780
947
- },
948
- {
949
- "epoch": 0.93,
950
- "learning_rate": 0.0,
951
- "loss": 1.8694,
952
- "step": 2800
953
- },
954
- {
955
- "epoch": 0.93,
956
- "eval_loss": 1.970503568649292,
957
- "eval_runtime": 11969.9504,
958
- "eval_samples_per_second": 1.117,
959
- "eval_steps_per_second": 0.14,
960
- "step": 2800
961
- },
962
- {
963
- "epoch": 0.94,
964
- "learning_rate": 0.0,
965
- "loss": 1.8201,
966
- "step": 2820
967
- },
968
- {
969
- "epoch": 0.94,
970
- "learning_rate": 0.0,
971
- "loss": 1.848,
972
- "step": 2840
973
- },
974
- {
975
- "epoch": 0.95,
976
- "learning_rate": 0.0,
977
- "loss": 1.8138,
978
- "step": 2860
979
- },
980
- {
981
- "epoch": 0.96,
982
- "learning_rate": 0.0,
983
- "loss": 1.8554,
984
- "step": 2880
985
- },
986
- {
987
- "epoch": 0.96,
988
- "learning_rate": 0.0,
989
- "loss": 1.8321,
990
- "step": 2900
991
- },
992
- {
993
- "epoch": 0.97,
994
- "learning_rate": 0.0,
995
- "loss": 1.8404,
996
- "step": 2920
997
- },
998
- {
999
- "epoch": 0.98,
1000
- "learning_rate": 0.0,
1001
- "loss": 1.7883,
1002
- "step": 2940
1003
- },
1004
- {
1005
- "epoch": 0.98,
1006
- "learning_rate": 0.0,
1007
- "loss": 1.8733,
1008
- "step": 2960
1009
- },
1010
- {
1011
- "epoch": 0.99,
1012
- "learning_rate": 0.0,
1013
- "loss": 1.8074,
1014
- "step": 2980
1015
- },
1016
- {
1017
- "epoch": 1.0,
1018
- "learning_rate": 0.0,
1019
- "loss": 1.887,
1020
- "step": 3000
1021
- },
1022
- {
1023
- "epoch": 1.0,
1024
- "eval_loss": 1.970503568649292,
1025
- "eval_runtime": 11974.5637,
1026
- "eval_samples_per_second": 1.116,
1027
- "eval_steps_per_second": 0.14,
1028
- "step": 3000
1029
- },
1030
- {
1031
- "epoch": 1.0,
1032
- "learning_rate": 0.0,
1033
- "loss": 1.7165,
1034
- "step": 3020
1035
- },
1036
- {
1037
- "epoch": 1.01,
1038
- "learning_rate": 0.0,
1039
- "loss": 1.8041,
1040
- "step": 3040
1041
- },
1042
- {
1043
- "epoch": 1.01,
1044
- "learning_rate": 0.0,
1045
- "loss": 1.8031,
1046
- "step": 3060
1047
- },
1048
- {
1049
- "epoch": 1.02,
1050
- "learning_rate": 0.0,
1051
- "loss": 1.8493,
1052
- "step": 3080
1053
- },
1054
- {
1055
- "epoch": 1.03,
1056
- "learning_rate": 0.0,
1057
- "loss": 1.8475,
1058
- "step": 3100
1059
- },
1060
- {
1061
- "epoch": 1.03,
1062
- "learning_rate": 0.0,
1063
- "loss": 1.7994,
1064
- "step": 3120
1065
- },
1066
- {
1067
- "epoch": 1.04,
1068
- "learning_rate": 0.0,
1069
- "loss": 1.7904,
1070
- "step": 3140
1071
- },
1072
- {
1073
- "epoch": 1.05,
1074
- "learning_rate": 0.0,
1075
- "loss": 1.8078,
1076
- "step": 3160
1077
- },
1078
- {
1079
- "epoch": 1.05,
1080
- "learning_rate": 0.0,
1081
- "loss": 1.8625,
1082
- "step": 3180
1083
- },
1084
- {
1085
- "epoch": 1.06,
1086
- "learning_rate": 0.0,
1087
- "loss": 1.7874,
1088
- "step": 3200
1089
- },
1090
- {
1091
- "epoch": 1.06,
1092
- "eval_loss": 1.970503568649292,
1093
- "eval_runtime": 11966.566,
1094
- "eval_samples_per_second": 1.117,
1095
- "eval_steps_per_second": 0.14,
1096
- "step": 3200
1097
- },
1098
- {
1099
- "epoch": 1.07,
1100
- "learning_rate": 0.0,
1101
- "loss": 1.7956,
1102
- "step": 3220
1103
- },
1104
- {
1105
- "epoch": 1.07,
1106
- "learning_rate": 0.0,
1107
- "loss": 1.7517,
1108
- "step": 3240
1109
- },
1110
- {
1111
- "epoch": 1.08,
1112
- "learning_rate": 0.0,
1113
- "loss": 1.7861,
1114
- "step": 3260
1115
- },
1116
- {
1117
- "epoch": 1.09,
1118
- "learning_rate": 0.0,
1119
- "loss": 1.7957,
1120
- "step": 3280
1121
- },
1122
- {
1123
- "epoch": 1.09,
1124
- "learning_rate": 0.0,
1125
- "loss": 1.8147,
1126
- "step": 3300
1127
- },
1128
- {
1129
- "epoch": 1.1,
1130
- "learning_rate": 0.0,
1131
- "loss": 1.7795,
1132
- "step": 3320
1133
- },
1134
- {
1135
- "epoch": 1.11,
1136
- "learning_rate": 0.0,
1137
- "loss": 1.7532,
1138
- "step": 3340
1139
- },
1140
- {
1141
- "epoch": 1.11,
1142
- "learning_rate": 0.0,
1143
- "loss": 1.8414,
1144
- "step": 3360
1145
- },
1146
- {
1147
- "epoch": 1.12,
1148
- "learning_rate": 0.0,
1149
- "loss": 1.771,
1150
- "step": 3380
1151
- },
1152
- {
1153
- "epoch": 1.13,
1154
- "learning_rate": 0.0,
1155
- "loss": 1.8119,
1156
- "step": 3400
1157
- },
1158
- {
1159
- "epoch": 1.13,
1160
- "eval_loss": 1.970503568649292,
1161
- "eval_runtime": 11954.7549,
1162
- "eval_samples_per_second": 1.118,
1163
- "eval_steps_per_second": 0.14,
1164
- "step": 3400
1165
- },
1166
- {
1167
- "epoch": 1.13,
1168
- "learning_rate": 0.0,
1169
- "loss": 1.7944,
1170
- "step": 3420
1171
- },
1172
- {
1173
- "epoch": 1.14,
1174
- "learning_rate": 0.0,
1175
- "loss": 1.7855,
1176
- "step": 3440
1177
- },
1178
- {
1179
- "epoch": 1.15,
1180
- "learning_rate": 0.0,
1181
- "loss": 1.8574,
1182
- "step": 3460
1183
- },
1184
- {
1185
- "epoch": 1.15,
1186
- "learning_rate": 0.0,
1187
- "loss": 1.8189,
1188
- "step": 3480
1189
- },
1190
- {
1191
- "epoch": 1.16,
1192
- "learning_rate": 0.0,
1193
- "loss": 1.8033,
1194
- "step": 3500
1195
- },
1196
- {
1197
- "epoch": 1.17,
1198
- "learning_rate": 0.0,
1199
- "loss": 1.7689,
1200
- "step": 3520
1201
- },
1202
- {
1203
- "epoch": 1.17,
1204
- "learning_rate": 0.0,
1205
- "loss": 1.8455,
1206
- "step": 3540
1207
- },
1208
- {
1209
- "epoch": 1.18,
1210
- "learning_rate": 0.0,
1211
- "loss": 1.8255,
1212
- "step": 3560
1213
- },
1214
- {
1215
- "epoch": 1.19,
1216
- "learning_rate": 0.0,
1217
- "loss": 1.8431,
1218
- "step": 3580
1219
- },
1220
- {
1221
- "epoch": 1.19,
1222
- "learning_rate": 0.0,
1223
- "loss": 1.7924,
1224
- "step": 3600
1225
- },
1226
- {
1227
- "epoch": 1.19,
1228
- "eval_loss": 1.970503568649292,
1229
- "eval_runtime": 11953.5885,
1230
- "eval_samples_per_second": 1.118,
1231
- "eval_steps_per_second": 0.14,
1232
- "step": 3600
1233
- },
1234
- {
1235
- "epoch": 1.2,
1236
- "learning_rate": 0.0,
1237
- "loss": 1.8475,
1238
- "step": 3620
1239
- },
1240
- {
1241
- "epoch": 1.21,
1242
- "learning_rate": 0.0,
1243
- "loss": 1.6746,
1244
- "step": 3640
1245
- },
1246
- {
1247
- "epoch": 1.21,
1248
- "learning_rate": 0.0,
1249
- "loss": 1.8006,
1250
- "step": 3660
1251
- },
1252
- {
1253
- "epoch": 1.22,
1254
- "learning_rate": 0.0,
1255
- "loss": 1.816,
1256
- "step": 3680
1257
- },
1258
- {
1259
- "epoch": 1.23,
1260
- "learning_rate": 0.0,
1261
- "loss": 1.7486,
1262
- "step": 3700
1263
- },
1264
- {
1265
- "epoch": 1.23,
1266
- "learning_rate": 0.0,
1267
- "loss": 1.8086,
1268
- "step": 3720
1269
- },
1270
- {
1271
- "epoch": 1.24,
1272
- "learning_rate": 0.0,
1273
- "loss": 1.8241,
1274
- "step": 3740
1275
- },
1276
- {
1277
- "epoch": 1.25,
1278
- "learning_rate": 0.0,
1279
- "loss": 1.823,
1280
- "step": 3760
1281
- },
1282
- {
1283
- "epoch": 1.25,
1284
- "learning_rate": 0.0,
1285
- "loss": 1.7365,
1286
- "step": 3780
1287
- },
1288
- {
1289
- "epoch": 1.26,
1290
- "learning_rate": 0.0,
1291
- "loss": 1.801,
1292
- "step": 3800
1293
- },
1294
- {
1295
- "epoch": 1.26,
1296
- "eval_loss": 1.970503568649292,
1297
- "eval_runtime": 11952.013,
1298
- "eval_samples_per_second": 1.118,
1299
- "eval_steps_per_second": 0.14,
1300
- "step": 3800
1301
- },
1302
- {
1303
- "epoch": 1.27,
1304
- "learning_rate": 0.0,
1305
- "loss": 1.8497,
1306
- "step": 3820
1307
- },
1308
- {
1309
- "epoch": 1.27,
1310
- "learning_rate": 0.0,
1311
- "loss": 1.7803,
1312
- "step": 3840
1313
- },
1314
- {
1315
- "epoch": 1.28,
1316
- "learning_rate": 0.0,
1317
- "loss": 1.6921,
1318
- "step": 3860
1319
- },
1320
- {
1321
- "epoch": 1.29,
1322
- "learning_rate": 0.0,
1323
- "loss": 1.8508,
1324
- "step": 3880
1325
- },
1326
- {
1327
- "epoch": 1.29,
1328
- "learning_rate": 0.0,
1329
- "loss": 1.8816,
1330
- "step": 3900
1331
- },
1332
- {
1333
- "epoch": 1.3,
1334
- "learning_rate": 0.0,
1335
- "loss": 1.8105,
1336
- "step": 3920
1337
- },
1338
- {
1339
- "epoch": 1.31,
1340
- "learning_rate": 0.0,
1341
- "loss": 1.8477,
1342
- "step": 3940
1343
- },
1344
- {
1345
- "epoch": 1.31,
1346
- "learning_rate": 0.0,
1347
- "loss": 1.841,
1348
- "step": 3960
1349
- },
1350
- {
1351
- "epoch": 1.32,
1352
- "learning_rate": 0.0,
1353
- "loss": 1.7599,
1354
- "step": 3980
1355
- },
1356
- {
1357
- "epoch": 1.33,
1358
- "learning_rate": 0.0,
1359
- "loss": 1.7074,
1360
- "step": 4000
1361
- },
1362
- {
1363
- "epoch": 1.33,
1364
- "eval_loss": 1.970503568649292,
1365
- "eval_runtime": 11952.3623,
1366
- "eval_samples_per_second": 1.118,
1367
- "eval_steps_per_second": 0.14,
1368
- "step": 4000
1369
- },
1370
- {
1371
- "epoch": 1.33,
1372
- "learning_rate": 0.0,
1373
- "loss": 1.7529,
1374
- "step": 4020
1375
- },
1376
- {
1377
- "epoch": 1.34,
1378
- "learning_rate": 0.0,
1379
- "loss": 1.8928,
1380
- "step": 4040
1381
- },
1382
- {
1383
- "epoch": 1.35,
1384
- "learning_rate": 0.0,
1385
- "loss": 1.8585,
1386
- "step": 4060
1387
- },
1388
- {
1389
- "epoch": 1.35,
1390
- "learning_rate": 0.0,
1391
- "loss": 1.8279,
1392
- "step": 4080
1393
- },
1394
- {
1395
- "epoch": 1.36,
1396
- "learning_rate": 0.0,
1397
- "loss": 1.7949,
1398
- "step": 4100
1399
- },
1400
- {
1401
- "epoch": 1.37,
1402
- "learning_rate": 0.0,
1403
- "loss": 1.8488,
1404
- "step": 4120
1405
- },
1406
- {
1407
- "epoch": 1.37,
1408
- "learning_rate": 0.0,
1409
- "loss": 1.7572,
1410
- "step": 4140
1411
- },
1412
- {
1413
- "epoch": 1.38,
1414
- "learning_rate": 0.0,
1415
- "loss": 1.7533,
1416
- "step": 4160
1417
- },
1418
- {
1419
- "epoch": 1.39,
1420
- "learning_rate": 0.0,
1421
- "loss": 1.772,
1422
- "step": 4180
1423
- },
1424
- {
1425
- "epoch": 1.39,
1426
- "learning_rate": 0.0,
1427
- "loss": 1.8338,
1428
- "step": 4200
1429
- },
1430
- {
1431
- "epoch": 1.39,
1432
- "eval_loss": 1.970503568649292,
1433
- "eval_runtime": 11954.2113,
1434
- "eval_samples_per_second": 1.118,
1435
- "eval_steps_per_second": 0.14,
1436
- "step": 4200
1437
- },
1438
- {
1439
- "epoch": 1.4,
1440
- "learning_rate": 0.0,
1441
- "loss": 1.8084,
1442
- "step": 4220
1443
- },
1444
- {
1445
- "epoch": 1.41,
1446
- "learning_rate": 0.0,
1447
- "loss": 1.8791,
1448
- "step": 4240
1449
- },
1450
- {
1451
- "epoch": 1.41,
1452
- "learning_rate": 0.0,
1453
- "loss": 1.7906,
1454
- "step": 4260
1455
- },
1456
- {
1457
- "epoch": 1.42,
1458
- "learning_rate": 0.0,
1459
- "loss": 1.8669,
1460
- "step": 4280
1461
- },
1462
- {
1463
- "epoch": 1.43,
1464
- "learning_rate": 0.0,
1465
- "loss": 1.8108,
1466
- "step": 4300
1467
- },
1468
- {
1469
- "epoch": 1.43,
1470
- "learning_rate": 0.0,
1471
- "loss": 1.7769,
1472
- "step": 4320
1473
- },
1474
- {
1475
- "epoch": 1.44,
1476
- "learning_rate": 0.0,
1477
- "loss": 1.7599,
1478
- "step": 4340
1479
- },
1480
- {
1481
- "epoch": 1.45,
1482
- "learning_rate": 0.0,
1483
- "loss": 1.843,
1484
- "step": 4360
1485
- },
1486
- {
1487
- "epoch": 1.45,
1488
- "learning_rate": 0.0,
1489
- "loss": 1.8608,
1490
- "step": 4380
1491
- },
1492
- {
1493
- "epoch": 1.46,
1494
- "learning_rate": 0.0,
1495
- "loss": 1.8382,
1496
- "step": 4400
1497
- },
1498
- {
1499
- "epoch": 1.46,
1500
- "eval_loss": 1.970503568649292,
1501
- "eval_runtime": 11958.7166,
1502
- "eval_samples_per_second": 1.118,
1503
- "eval_steps_per_second": 0.14,
1504
- "step": 4400
1505
- },
1506
- {
1507
- "epoch": 1.47,
1508
- "learning_rate": 0.0,
1509
- "loss": 1.8002,
1510
- "step": 4420
1511
- },
1512
- {
1513
- "epoch": 1.47,
1514
- "learning_rate": 0.0,
1515
- "loss": 1.8292,
1516
- "step": 4440
1517
- },
1518
- {
1519
- "epoch": 1.48,
1520
- "learning_rate": 0.0,
1521
- "loss": 1.8184,
1522
- "step": 4460
1523
- },
1524
- {
1525
- "epoch": 1.49,
1526
- "learning_rate": 0.0,
1527
- "loss": 1.7712,
1528
- "step": 4480
1529
- },
1530
- {
1531
- "epoch": 1.49,
1532
- "learning_rate": 0.0,
1533
- "loss": 1.8307,
1534
- "step": 4500
1535
- },
1536
- {
1537
- "epoch": 1.5,
1538
- "learning_rate": 0.0,
1539
- "loss": 1.7883,
1540
- "step": 4520
1541
- },
1542
- {
1543
- "epoch": 1.51,
1544
- "learning_rate": 0.0,
1545
- "loss": 1.8106,
1546
- "step": 4540
1547
- },
1548
- {
1549
- "epoch": 1.51,
1550
- "learning_rate": 0.0,
1551
- "loss": 1.7999,
1552
- "step": 4560
1553
- },
1554
- {
1555
- "epoch": 1.52,
1556
- "learning_rate": 0.0,
1557
- "loss": 1.8353,
1558
- "step": 4580
1559
- },
1560
- {
1561
- "epoch": 1.53,
1562
- "learning_rate": 0.0,
1563
- "loss": 1.8366,
1564
- "step": 4600
1565
- },
1566
- {
1567
- "epoch": 1.53,
1568
- "eval_loss": 1.970503568649292,
1569
- "eval_runtime": 11971.0491,
1570
- "eval_samples_per_second": 1.117,
1571
- "eval_steps_per_second": 0.14,
1572
- "step": 4600
1573
- },
1574
- {
1575
- "epoch": 1.53,
1576
- "learning_rate": 0.0,
1577
- "loss": 1.8018,
1578
- "step": 4620
1579
- },
1580
- {
1581
- "epoch": 1.54,
1582
- "learning_rate": 0.0,
1583
- "loss": 1.84,
1584
- "step": 4640
1585
- },
1586
- {
1587
- "epoch": 1.55,
1588
- "learning_rate": 0.0,
1589
- "loss": 1.7784,
1590
- "step": 4660
1591
- },
1592
- {
1593
- "epoch": 1.55,
1594
- "learning_rate": 0.0,
1595
- "loss": 1.794,
1596
- "step": 4680
1597
- },
1598
- {
1599
- "epoch": 1.56,
1600
- "learning_rate": 0.0,
1601
- "loss": 1.8237,
1602
- "step": 4700
1603
- },
1604
- {
1605
- "epoch": 1.57,
1606
- "learning_rate": 0.0,
1607
- "loss": 1.7697,
1608
- "step": 4720
1609
- },
1610
- {
1611
- "epoch": 1.57,
1612
- "learning_rate": 0.0,
1613
- "loss": 1.8482,
1614
- "step": 4740
1615
- },
1616
- {
1617
- "epoch": 1.58,
1618
- "learning_rate": 0.0,
1619
- "loss": 1.8008,
1620
- "step": 4760
1621
- },
1622
- {
1623
- "epoch": 1.59,
1624
- "learning_rate": 0.0,
1625
- "loss": 1.8082,
1626
- "step": 4780
1627
- },
1628
- {
1629
- "epoch": 1.59,
1630
- "learning_rate": 0.0,
1631
- "loss": 1.7799,
1632
- "step": 4800
1633
- },
1634
- {
1635
- "epoch": 1.59,
1636
- "eval_loss": 1.970503568649292,
1637
- "eval_runtime": 11955.9821,
1638
- "eval_samples_per_second": 1.118,
1639
- "eval_steps_per_second": 0.14,
1640
- "step": 4800
1641
- },
1642
- {
1643
- "epoch": 1.6,
1644
- "learning_rate": 0.0,
1645
- "loss": 1.8339,
1646
- "step": 4820
1647
- },
1648
- {
1649
- "epoch": 1.61,
1650
- "learning_rate": 0.0,
1651
- "loss": 1.8072,
1652
- "step": 4840
1653
- },
1654
- {
1655
- "epoch": 1.61,
1656
- "learning_rate": 0.0,
1657
- "loss": 1.8024,
1658
- "step": 4860
1659
- },
1660
- {
1661
- "epoch": 1.62,
1662
- "learning_rate": 0.0,
1663
- "loss": 1.8609,
1664
- "step": 4880
1665
- },
1666
- {
1667
- "epoch": 1.63,
1668
- "learning_rate": 0.0,
1669
- "loss": 1.8769,
1670
- "step": 4900
1671
- },
1672
- {
1673
- "epoch": 1.63,
1674
- "learning_rate": 0.0,
1675
- "loss": 1.808,
1676
- "step": 4920
1677
- },
1678
- {
1679
- "epoch": 1.64,
1680
- "learning_rate": 0.0,
1681
- "loss": 1.8482,
1682
- "step": 4940
1683
- },
1684
- {
1685
- "epoch": 1.65,
1686
- "learning_rate": 0.0,
1687
- "loss": 1.8116,
1688
- "step": 4960
1689
- },
1690
- {
1691
- "epoch": 1.65,
1692
- "learning_rate": 0.0,
1693
- "loss": 1.7922,
1694
- "step": 4980
1695
- },
1696
- {
1697
- "epoch": 1.66,
1698
- "learning_rate": 0.0,
1699
- "loss": 1.8409,
1700
- "step": 5000
1701
- },
1702
- {
1703
- "epoch": 1.66,
1704
- "eval_loss": 1.970503568649292,
1705
- "eval_runtime": 11953.6568,
1706
- "eval_samples_per_second": 1.118,
1707
- "eval_steps_per_second": 0.14,
1708
- "step": 5000
1709
- },
1710
- {
1711
- "epoch": 1.67,
1712
- "learning_rate": 0.0,
1713
- "loss": 1.867,
1714
- "step": 5020
1715
- },
1716
- {
1717
- "epoch": 1.67,
1718
- "learning_rate": 0.0,
1719
- "loss": 1.8902,
1720
- "step": 5040
1721
- },
1722
- {
1723
- "epoch": 1.68,
1724
- "learning_rate": 0.0,
1725
- "loss": 1.8835,
1726
- "step": 5060
1727
- },
1728
- {
1729
- "epoch": 1.68,
1730
- "learning_rate": 0.0,
1731
- "loss": 1.741,
1732
- "step": 5080
1733
- },
1734
- {
1735
- "epoch": 1.69,
1736
- "learning_rate": 0.0,
1737
- "loss": 1.7543,
1738
- "step": 5100
1739
- },
1740
- {
1741
- "epoch": 1.7,
1742
- "learning_rate": 0.0,
1743
- "loss": 1.7913,
1744
- "step": 5120
1745
- },
1746
- {
1747
- "epoch": 1.7,
1748
- "learning_rate": 0.0,
1749
- "loss": 1.829,
1750
- "step": 5140
1751
- },
1752
- {
1753
- "epoch": 1.71,
1754
- "learning_rate": 0.0,
1755
- "loss": 1.7915,
1756
- "step": 5160
1757
- },
1758
- {
1759
- "epoch": 1.72,
1760
- "learning_rate": 0.0,
1761
- "loss": 1.767,
1762
- "step": 5180
1763
- },
1764
- {
1765
- "epoch": 1.72,
1766
- "learning_rate": 0.0,
1767
- "loss": 1.7809,
1768
- "step": 5200
1769
- },
1770
- {
1771
- "epoch": 1.72,
1772
- "eval_loss": 1.970503568649292,
1773
- "eval_runtime": 11951.295,
1774
- "eval_samples_per_second": 1.119,
1775
- "eval_steps_per_second": 0.14,
1776
- "step": 5200
1777
- },
1778
- {
1779
- "epoch": 1.73,
1780
- "learning_rate": 0.0,
1781
- "loss": 1.8715,
1782
- "step": 5220
1783
- },
1784
- {
1785
- "epoch": 1.74,
1786
- "learning_rate": 0.0,
1787
- "loss": 1.8517,
1788
- "step": 5240
1789
- },
1790
- {
1791
- "epoch": 1.74,
1792
- "learning_rate": 0.0,
1793
- "loss": 1.8255,
1794
- "step": 5260
1795
- },
1796
- {
1797
- "epoch": 1.75,
1798
- "learning_rate": 0.0,
1799
- "loss": 1.7973,
1800
- "step": 5280
1801
- },
1802
- {
1803
- "epoch": 1.76,
1804
- "learning_rate": 0.0,
1805
- "loss": 1.7912,
1806
- "step": 5300
1807
- },
1808
- {
1809
- "epoch": 1.76,
1810
- "learning_rate": 0.0,
1811
- "loss": 1.8331,
1812
- "step": 5320
1813
- },
1814
- {
1815
- "epoch": 1.77,
1816
- "learning_rate": 0.0,
1817
- "loss": 1.835,
1818
- "step": 5340
1819
- },
1820
- {
1821
- "epoch": 1.78,
1822
- "learning_rate": 0.0,
1823
- "loss": 1.8051,
1824
- "step": 5360
1825
- },
1826
- {
1827
- "epoch": 1.78,
1828
- "learning_rate": 0.0,
1829
- "loss": 1.8007,
1830
- "step": 5380
1831
- },
1832
- {
1833
- "epoch": 1.79,
1834
- "learning_rate": 0.0,
1835
- "loss": 1.8069,
1836
- "step": 5400
1837
- },
1838
- {
1839
- "epoch": 1.79,
1840
- "eval_loss": 1.970503568649292,
1841
- "eval_runtime": 11948.5086,
1842
- "eval_samples_per_second": 1.119,
1843
- "eval_steps_per_second": 0.14,
1844
- "step": 5400
1845
- },
1846
- {
1847
- "epoch": 1.8,
1848
- "learning_rate": 0.0,
1849
- "loss": 1.784,
1850
- "step": 5420
1851
- },
1852
- {
1853
- "epoch": 1.8,
1854
- "learning_rate": 0.0,
1855
- "loss": 1.7557,
1856
- "step": 5440
1857
- },
1858
- {
1859
- "epoch": 1.81,
1860
- "learning_rate": 0.0,
1861
- "loss": 1.8011,
1862
- "step": 5460
1863
- },
1864
- {
1865
- "epoch": 1.82,
1866
- "learning_rate": 0.0,
1867
- "loss": 1.8392,
1868
- "step": 5480
1869
- },
1870
- {
1871
- "epoch": 1.82,
1872
- "learning_rate": 0.0,
1873
- "loss": 1.7507,
1874
- "step": 5500
1875
- },
1876
- {
1877
- "epoch": 1.83,
1878
- "learning_rate": 0.0,
1879
- "loss": 1.8043,
1880
- "step": 5520
1881
- },
1882
- {
1883
- "epoch": 1.84,
1884
- "learning_rate": 0.0,
1885
- "loss": 1.7804,
1886
- "step": 5540
1887
- },
1888
- {
1889
- "epoch": 1.84,
1890
- "learning_rate": 0.0,
1891
- "loss": 1.811,
1892
- "step": 5560
1893
- },
1894
- {
1895
- "epoch": 1.85,
1896
- "learning_rate": 0.0,
1897
- "loss": 1.806,
1898
- "step": 5580
1899
- },
1900
- {
1901
- "epoch": 1.86,
1902
- "learning_rate": 0.0,
1903
- "loss": 1.8103,
1904
- "step": 5600
1905
- },
1906
- {
1907
- "epoch": 1.86,
1908
- "eval_loss": 1.970503568649292,
1909
- "eval_runtime": 11953.5582,
1910
- "eval_samples_per_second": 1.118,
1911
- "eval_steps_per_second": 0.14,
1912
- "step": 5600
1913
- },
1914
- {
1915
- "epoch": 1.86,
1916
- "learning_rate": 0.0,
1917
- "loss": 1.7947,
1918
- "step": 5620
1919
- },
1920
- {
1921
- "epoch": 1.87,
1922
- "learning_rate": 0.0,
1923
- "loss": 1.8062,
1924
- "step": 5640
1925
- },
1926
- {
1927
- "epoch": 1.88,
1928
- "learning_rate": 0.0,
1929
- "loss": 1.7997,
1930
- "step": 5660
1931
- },
1932
- {
1933
- "epoch": 1.88,
1934
- "learning_rate": 0.0,
1935
- "loss": 1.843,
1936
- "step": 5680
1937
- },
1938
- {
1939
- "epoch": 1.89,
1940
- "learning_rate": 0.0,
1941
- "loss": 1.848,
1942
- "step": 5700
1943
- },
1944
- {
1945
- "epoch": 1.9,
1946
- "learning_rate": 0.0,
1947
- "loss": 1.8393,
1948
- "step": 5720
1949
- },
1950
- {
1951
- "epoch": 1.9,
1952
- "learning_rate": 0.0,
1953
- "loss": 1.8187,
1954
- "step": 5740
1955
- },
1956
- {
1957
- "epoch": 1.91,
1958
- "learning_rate": 0.0,
1959
- "loss": 1.802,
1960
- "step": 5760
1961
- },
1962
- {
1963
- "epoch": 1.92,
1964
- "learning_rate": 0.0,
1965
- "loss": 1.7646,
1966
- "step": 5780
1967
- },
1968
- {
1969
- "epoch": 1.92,
1970
- "learning_rate": 0.0,
1971
- "loss": 1.7993,
1972
- "step": 5800
1973
- },
1974
- {
1975
- "epoch": 1.92,
1976
- "eval_loss": 1.970503568649292,
1977
- "eval_runtime": 11953.0956,
1978
- "eval_samples_per_second": 1.118,
1979
- "eval_steps_per_second": 0.14,
1980
- "step": 5800
1981
- },
1982
- {
1983
- "epoch": 1.93,
1984
- "learning_rate": 0.0,
1985
- "loss": 1.7962,
1986
- "step": 5820
1987
- },
1988
- {
1989
- "epoch": 1.94,
1990
- "learning_rate": 0.0,
1991
- "loss": 1.849,
1992
- "step": 5840
1993
- },
1994
- {
1995
- "epoch": 1.94,
1996
- "learning_rate": 0.0,
1997
- "loss": 1.8254,
1998
- "step": 5860
1999
- },
2000
- {
2001
- "epoch": 1.95,
2002
- "learning_rate": 0.0,
2003
- "loss": 1.8583,
2004
- "step": 5880
2005
- },
2006
- {
2007
- "epoch": 1.96,
2008
- "learning_rate": 0.0,
2009
- "loss": 1.8398,
2010
- "step": 5900
2011
- },
2012
- {
2013
- "epoch": 1.96,
2014
- "learning_rate": 0.0,
2015
- "loss": 1.7306,
2016
- "step": 5920
2017
- },
2018
- {
2019
- "epoch": 1.97,
2020
- "learning_rate": 0.0,
2021
- "loss": 1.81,
2022
- "step": 5940
2023
- },
2024
- {
2025
- "epoch": 1.98,
2026
- "learning_rate": 0.0,
2027
- "loss": 1.7762,
2028
- "step": 5960
2029
- },
2030
- {
2031
- "epoch": 1.98,
2032
- "learning_rate": 0.0,
2033
- "loss": 1.8549,
2034
- "step": 5980
2035
- },
2036
- {
2037
- "epoch": 1.99,
2038
- "learning_rate": 0.0,
2039
- "loss": 1.84,
2040
- "step": 6000
2041
- },
2042
- {
2043
- "epoch": 1.99,
2044
- "eval_loss": 1.970503568649292,
2045
- "eval_runtime": 11950.3766,
2046
- "eval_samples_per_second": 1.119,
2047
- "eval_steps_per_second": 0.14,
2048
- "step": 6000
2049
- },
2050
- {
2051
- "epoch": 2.0,
2052
- "learning_rate": 0.0,
2053
- "loss": 1.7897,
2054
- "step": 6020
2055
- },
2056
- {
2057
- "epoch": 2.0,
2058
- "learning_rate": 0.0,
2059
- "loss": 1.7844,
2060
- "step": 6040
2061
- },
2062
- {
2063
- "epoch": 2.01,
2064
- "learning_rate": 0.0,
2065
- "loss": 1.8453,
2066
- "step": 6060
2067
- },
2068
- {
2069
- "epoch": 2.02,
2070
- "learning_rate": 0.0,
2071
- "loss": 1.8268,
2072
- "step": 6080
2073
- },
2074
- {
2075
- "epoch": 2.02,
2076
- "learning_rate": 0.0,
2077
- "loss": 1.7932,
2078
- "step": 6100
2079
- },
2080
- {
2081
- "epoch": 2.03,
2082
- "learning_rate": 0.0,
2083
- "loss": 1.8005,
2084
- "step": 6120
2085
- },
2086
- {
2087
- "epoch": 2.04,
2088
- "learning_rate": 0.0,
2089
- "loss": 1.773,
2090
- "step": 6140
2091
- },
2092
- {
2093
- "epoch": 2.04,
2094
- "learning_rate": 0.0,
2095
- "loss": 1.8029,
2096
- "step": 6160
2097
- },
2098
- {
2099
- "epoch": 2.05,
2100
- "learning_rate": 0.0,
2101
- "loss": 1.8283,
2102
- "step": 6180
2103
- },
2104
- {
2105
- "epoch": 2.06,
2106
- "learning_rate": 0.0,
2107
- "loss": 1.8167,
2108
- "step": 6200
2109
- },
2110
- {
2111
- "epoch": 2.06,
2112
- "eval_loss": 1.970503568649292,
2113
- "eval_runtime": 11950.9199,
2114
- "eval_samples_per_second": 1.119,
2115
- "eval_steps_per_second": 0.14,
2116
- "step": 6200
2117
- },
2118
- {
2119
- "epoch": 2.06,
2120
- "learning_rate": 0.0,
2121
- "loss": 1.8465,
2122
- "step": 6220
2123
- },
2124
- {
2125
- "epoch": 2.07,
2126
- "learning_rate": 0.0,
2127
- "loss": 1.8747,
2128
- "step": 6240
2129
- },
2130
- {
2131
- "epoch": 2.08,
2132
- "learning_rate": 0.0,
2133
- "loss": 1.8031,
2134
- "step": 6260
2135
- },
2136
- {
2137
- "epoch": 2.08,
2138
- "learning_rate": 0.0,
2139
- "loss": 1.8366,
2140
- "step": 6280
2141
- },
2142
- {
2143
- "epoch": 2.09,
2144
- "learning_rate": 0.0,
2145
- "loss": 1.7998,
2146
- "step": 6300
2147
- },
2148
- {
2149
- "epoch": 2.1,
2150
- "learning_rate": 0.0,
2151
- "loss": 1.8143,
2152
- "step": 6320
2153
- },
2154
- {
2155
- "epoch": 2.1,
2156
- "learning_rate": 0.0,
2157
- "loss": 1.8586,
2158
- "step": 6340
2159
- },
2160
- {
2161
- "epoch": 2.11,
2162
- "learning_rate": 0.0,
2163
- "loss": 1.836,
2164
- "step": 6360
2165
- },
2166
- {
2167
- "epoch": 2.12,
2168
- "learning_rate": 0.0,
2169
- "loss": 1.863,
2170
- "step": 6380
2171
- },
2172
- {
2173
- "epoch": 2.12,
2174
- "learning_rate": 0.0,
2175
- "loss": 1.7667,
2176
- "step": 6400
2177
- },
2178
- {
2179
- "epoch": 2.12,
2180
- "eval_loss": 1.970503568649292,
2181
- "eval_runtime": 11949.7143,
2182
- "eval_samples_per_second": 1.119,
2183
- "eval_steps_per_second": 0.14,
2184
- "step": 6400
2185
- },
2186
- {
2187
- "epoch": 2.13,
2188
- "learning_rate": 0.0,
2189
- "loss": 1.8187,
2190
- "step": 6420
2191
- },
2192
- {
2193
- "epoch": 2.14,
2194
- "learning_rate": 0.0,
2195
- "loss": 1.7883,
2196
- "step": 6440
2197
- },
2198
- {
2199
- "epoch": 2.14,
2200
- "learning_rate": 0.0,
2201
- "loss": 1.9299,
2202
- "step": 6460
2203
- },
2204
- {
2205
- "epoch": 2.15,
2206
- "learning_rate": 0.0,
2207
- "loss": 1.8286,
2208
- "step": 6480
2209
- },
2210
- {
2211
- "epoch": 2.16,
2212
- "learning_rate": 0.0,
2213
- "loss": 1.8181,
2214
- "step": 6500
2215
- },
2216
- {
2217
- "epoch": 2.16,
2218
- "learning_rate": 0.0,
2219
- "loss": 1.7856,
2220
- "step": 6520
2221
- },
2222
- {
2223
- "epoch": 2.17,
2224
- "learning_rate": 0.0,
2225
- "loss": 1.7192,
2226
- "step": 6540
2227
- },
2228
- {
2229
- "epoch": 2.18,
2230
- "learning_rate": 0.0,
2231
- "loss": 1.7715,
2232
- "step": 6560
2233
- },
2234
- {
2235
- "epoch": 2.18,
2236
- "learning_rate": 0.0,
2237
- "loss": 1.8359,
2238
- "step": 6580
2239
- },
2240
- {
2241
- "epoch": 2.19,
2242
- "learning_rate": 0.0,
2243
- "loss": 1.7989,
2244
- "step": 6600
2245
- },
2246
- {
2247
- "epoch": 2.19,
2248
- "eval_loss": 1.970503568649292,
2249
- "eval_runtime": 11949.8024,
2250
- "eval_samples_per_second": 1.119,
2251
- "eval_steps_per_second": 0.14,
2252
- "step": 6600
2253
- },
2254
- {
2255
- "epoch": 2.2,
2256
- "learning_rate": 0.0,
2257
- "loss": 1.7869,
2258
- "step": 6620
2259
- },
2260
- {
2261
- "epoch": 2.2,
2262
- "learning_rate": 0.0,
2263
- "loss": 1.7855,
2264
- "step": 6640
2265
- },
2266
- {
2267
- "epoch": 2.21,
2268
- "learning_rate": 0.0,
2269
- "loss": 1.8312,
2270
- "step": 6660
2271
- },
2272
- {
2273
- "epoch": 2.22,
2274
- "learning_rate": 0.0,
2275
- "loss": 1.7717,
2276
- "step": 6680
2277
- },
2278
- {
2279
- "epoch": 2.22,
2280
- "learning_rate": 0.0,
2281
- "loss": 1.7621,
2282
- "step": 6700
2283
- },
2284
- {
2285
- "epoch": 2.23,
2286
- "learning_rate": 0.0,
2287
- "loss": 1.8031,
2288
- "step": 6720
2289
- },
2290
- {
2291
- "epoch": 2.24,
2292
- "learning_rate": 0.0,
2293
- "loss": 1.8061,
2294
- "step": 6740
2295
- },
2296
- {
2297
- "epoch": 2.24,
2298
- "learning_rate": 0.0,
2299
- "loss": 1.8063,
2300
- "step": 6760
2301
- },
2302
- {
2303
- "epoch": 2.25,
2304
- "learning_rate": 0.0,
2305
- "loss": 1.7469,
2306
- "step": 6780
2307
- },
2308
- {
2309
- "epoch": 2.26,
2310
- "learning_rate": 0.0,
2311
- "loss": 1.7543,
2312
- "step": 6800
2313
- },
2314
- {
2315
- "epoch": 2.26,
2316
- "eval_loss": 1.970503568649292,
2317
- "eval_runtime": 11954.2042,
2318
- "eval_samples_per_second": 1.118,
2319
- "eval_steps_per_second": 0.14,
2320
- "step": 6800
2321
- },
2322
- {
2323
- "epoch": 2.26,
2324
- "learning_rate": 0.0,
2325
- "loss": 1.8659,
2326
- "step": 6820
2327
- },
2328
- {
2329
- "epoch": 2.27,
2330
- "learning_rate": 0.0,
2331
- "loss": 1.7935,
2332
- "step": 6840
2333
- },
2334
- {
2335
- "epoch": 2.28,
2336
- "learning_rate": 0.0,
2337
- "loss": 1.8247,
2338
- "step": 6860
2339
- },
2340
- {
2341
- "epoch": 2.28,
2342
- "learning_rate": 0.0,
2343
- "loss": 1.8339,
2344
- "step": 6880
2345
- },
2346
- {
2347
- "epoch": 2.29,
2348
- "learning_rate": 0.0,
2349
- "loss": 1.8419,
2350
- "step": 6900
2351
- },
2352
- {
2353
- "epoch": 2.3,
2354
- "learning_rate": 0.0,
2355
- "loss": 1.7978,
2356
- "step": 6920
2357
- },
2358
- {
2359
- "epoch": 2.3,
2360
- "learning_rate": 0.0,
2361
- "loss": 1.8048,
2362
- "step": 6940
2363
- },
2364
- {
2365
- "epoch": 2.31,
2366
- "learning_rate": 0.0,
2367
- "loss": 1.8523,
2368
- "step": 6960
2369
- },
2370
- {
2371
- "epoch": 2.32,
2372
- "learning_rate": 0.0,
2373
- "loss": 1.7875,
2374
- "step": 6980
2375
- },
2376
- {
2377
- "epoch": 2.32,
2378
- "learning_rate": 0.0,
2379
- "loss": 1.8262,
2380
- "step": 7000
2381
- },
2382
- {
2383
- "epoch": 2.32,
2384
- "eval_loss": 1.970503568649292,
2385
- "eval_runtime": 11967.8262,
2386
- "eval_samples_per_second": 1.117,
2387
- "eval_steps_per_second": 0.14,
2388
- "step": 7000
2389
- },
2390
- {
2391
- "epoch": 2.33,
2392
- "learning_rate": 0.0,
2393
- "loss": 1.8661,
2394
- "step": 7020
2395
- },
2396
- {
2397
- "epoch": 2.33,
2398
- "learning_rate": 0.0,
2399
- "loss": 1.8136,
2400
- "step": 7040
2401
- },
2402
- {
2403
- "epoch": 2.34,
2404
- "learning_rate": 0.0,
2405
- "loss": 1.8066,
2406
- "step": 7060
2407
- },
2408
- {
2409
- "epoch": 2.35,
2410
- "learning_rate": 0.0,
2411
- "loss": 1.8355,
2412
- "step": 7080
2413
- },
2414
- {
2415
- "epoch": 2.35,
2416
- "learning_rate": 0.0,
2417
- "loss": 1.7598,
2418
- "step": 7100
2419
- },
2420
- {
2421
- "epoch": 2.36,
2422
- "learning_rate": 0.0,
2423
- "loss": 1.8384,
2424
- "step": 7120
2425
- },
2426
- {
2427
- "epoch": 2.37,
2428
- "learning_rate": 0.0,
2429
- "loss": 1.7768,
2430
- "step": 7140
2431
- },
2432
- {
2433
- "epoch": 2.37,
2434
- "learning_rate": 0.0,
2435
- "loss": 1.8371,
2436
- "step": 7160
2437
- },
2438
- {
2439
- "epoch": 2.38,
2440
- "learning_rate": 0.0,
2441
- "loss": 1.7989,
2442
- "step": 7180
2443
- },
2444
- {
2445
- "epoch": 2.39,
2446
- "learning_rate": 0.0,
2447
- "loss": 1.8204,
2448
- "step": 7200
2449
- },
2450
- {
2451
- "epoch": 2.39,
2452
- "eval_loss": 1.970503568649292,
2453
- "eval_runtime": 11959.9088,
2454
- "eval_samples_per_second": 1.118,
2455
- "eval_steps_per_second": 0.14,
2456
- "step": 7200
2457
- },
2458
- {
2459
- "epoch": 2.39,
2460
- "learning_rate": 0.0,
2461
- "loss": 1.8632,
2462
- "step": 7220
2463
- },
2464
- {
2465
- "epoch": 2.4,
2466
- "learning_rate": 0.0,
2467
- "loss": 1.838,
2468
- "step": 7240
2469
- },
2470
- {
2471
- "epoch": 2.41,
2472
- "learning_rate": 0.0,
2473
- "loss": 1.8492,
2474
- "step": 7260
2475
- },
2476
- {
2477
- "epoch": 2.41,
2478
- "learning_rate": 0.0,
2479
- "loss": 1.8213,
2480
- "step": 7280
2481
- },
2482
- {
2483
- "epoch": 2.42,
2484
- "learning_rate": 0.0,
2485
- "loss": 1.7367,
2486
- "step": 7300
2487
- },
2488
- {
2489
- "epoch": 2.43,
2490
- "learning_rate": 0.0,
2491
- "loss": 1.9046,
2492
- "step": 7320
2493
- },
2494
- {
2495
- "epoch": 2.43,
2496
- "learning_rate": 0.0,
2497
- "loss": 1.7799,
2498
- "step": 7340
2499
- },
2500
- {
2501
- "epoch": 2.44,
2502
- "learning_rate": 0.0,
2503
- "loss": 1.793,
2504
- "step": 7360
2505
- },
2506
- {
2507
- "epoch": 2.45,
2508
- "learning_rate": 0.0,
2509
- "loss": 1.7864,
2510
- "step": 7380
2511
- },
2512
- {
2513
- "epoch": 2.45,
2514
- "learning_rate": 0.0,
2515
- "loss": 1.8071,
2516
- "step": 7400
2517
- },
2518
- {
2519
- "epoch": 2.45,
2520
- "eval_loss": 1.970503568649292,
2521
- "eval_runtime": 11949.9374,
2522
- "eval_samples_per_second": 1.119,
2523
- "eval_steps_per_second": 0.14,
2524
- "step": 7400
2525
- },
2526
- {
2527
- "epoch": 2.46,
2528
- "learning_rate": 0.0,
2529
- "loss": 1.7708,
2530
- "step": 7420
2531
- },
2532
- {
2533
- "epoch": 2.47,
2534
- "learning_rate": 0.0,
2535
- "loss": 1.8234,
2536
- "step": 7440
2537
- },
2538
- {
2539
- "epoch": 2.47,
2540
- "learning_rate": 0.0,
2541
- "loss": 1.8214,
2542
- "step": 7460
2543
- },
2544
- {
2545
- "epoch": 2.48,
2546
- "learning_rate": 0.0,
2547
- "loss": 1.8468,
2548
- "step": 7480
2549
- },
2550
- {
2551
- "epoch": 2.49,
2552
- "learning_rate": 0.0,
2553
- "loss": 1.784,
2554
- "step": 7500
2555
- },
2556
- {
2557
- "epoch": 2.49,
2558
- "learning_rate": 0.0,
2559
- "loss": 1.8571,
2560
- "step": 7520
2561
- },
2562
- {
2563
- "epoch": 2.5,
2564
- "learning_rate": 0.0,
2565
- "loss": 1.7871,
2566
- "step": 7540
2567
- },
2568
- {
2569
- "epoch": 2.51,
2570
- "learning_rate": 0.0,
2571
- "loss": 1.7819,
2572
- "step": 7560
2573
- },
2574
- {
2575
- "epoch": 2.51,
2576
- "learning_rate": 0.0,
2577
- "loss": 1.8035,
2578
- "step": 7580
2579
- },
2580
- {
2581
- "epoch": 2.52,
2582
- "learning_rate": 0.0,
2583
- "loss": 1.801,
2584
- "step": 7600
2585
- },
2586
- {
2587
- "epoch": 2.52,
2588
- "eval_loss": 1.970503568649292,
2589
- "eval_runtime": 11949.3183,
2590
- "eval_samples_per_second": 1.119,
2591
- "eval_steps_per_second": 0.14,
2592
- "step": 7600
2593
- },
2594
- {
2595
- "epoch": 2.53,
2596
- "learning_rate": 0.0,
2597
- "loss": 1.8302,
2598
- "step": 7620
2599
- },
2600
- {
2601
- "epoch": 2.53,
2602
- "learning_rate": 0.0,
2603
- "loss": 1.8208,
2604
- "step": 7640
2605
- },
2606
- {
2607
- "epoch": 2.54,
2608
- "learning_rate": 0.0,
2609
- "loss": 1.917,
2610
- "step": 7660
2611
- },
2612
- {
2613
- "epoch": 2.55,
2614
- "learning_rate": 0.0,
2615
- "loss": 1.8184,
2616
- "step": 7680
2617
- },
2618
- {
2619
- "epoch": 2.55,
2620
- "learning_rate": 0.0,
2621
- "loss": 1.7462,
2622
- "step": 7700
2623
- },
2624
- {
2625
- "epoch": 2.56,
2626
- "learning_rate": 0.0,
2627
- "loss": 1.7699,
2628
- "step": 7720
2629
- },
2630
- {
2631
- "epoch": 2.57,
2632
- "learning_rate": 0.0,
2633
- "loss": 1.8377,
2634
- "step": 7740
2635
- },
2636
- {
2637
- "epoch": 2.57,
2638
- "learning_rate": 0.0,
2639
- "loss": 1.7713,
2640
- "step": 7760
2641
- },
2642
- {
2643
- "epoch": 2.58,
2644
- "learning_rate": 0.0,
2645
- "loss": 1.798,
2646
- "step": 7780
2647
- },
2648
- {
2649
- "epoch": 2.59,
2650
- "learning_rate": 0.0,
2651
- "loss": 1.8425,
2652
- "step": 7800
2653
- },
2654
- {
2655
- "epoch": 2.59,
2656
- "eval_loss": 1.970503568649292,
2657
- "eval_runtime": 11947.7265,
2658
- "eval_samples_per_second": 1.119,
2659
- "eval_steps_per_second": 0.14,
2660
- "step": 7800
2661
- },
2662
- {
2663
- "epoch": 2.59,
2664
- "learning_rate": 0.0,
2665
- "loss": 1.7816,
2666
- "step": 7820
2667
- },
2668
- {
2669
- "epoch": 2.6,
2670
- "learning_rate": 0.0,
2671
- "loss": 1.8193,
2672
- "step": 7840
2673
- },
2674
- {
2675
- "epoch": 2.61,
2676
- "learning_rate": 0.0,
2677
- "loss": 1.8044,
2678
- "step": 7860
2679
- },
2680
- {
2681
- "epoch": 2.61,
2682
- "learning_rate": 0.0,
2683
- "loss": 1.7937,
2684
- "step": 7880
2685
- },
2686
- {
2687
- "epoch": 2.62,
2688
- "learning_rate": 0.0,
2689
- "loss": 1.8224,
2690
- "step": 7900
2691
- },
2692
- {
2693
- "epoch": 2.63,
2694
- "learning_rate": 0.0,
2695
- "loss": 1.7937,
2696
- "step": 7920
2697
- },
2698
- {
2699
- "epoch": 2.63,
2700
- "learning_rate": 0.0,
2701
- "loss": 1.7988,
2702
- "step": 7940
2703
- },
2704
- {
2705
- "epoch": 2.64,
2706
- "learning_rate": 0.0,
2707
- "loss": 1.8622,
2708
- "step": 7960
2709
- },
2710
- {
2711
- "epoch": 2.65,
2712
- "learning_rate": 0.0,
2713
- "loss": 1.8407,
2714
- "step": 7980
2715
- },
2716
- {
2717
- "epoch": 2.65,
2718
- "learning_rate": 0.0,
2719
- "loss": 1.7578,
2720
- "step": 8000
2721
- },
2722
- {
2723
- "epoch": 2.65,
2724
- "eval_loss": 1.970503568649292,
2725
- "eval_runtime": 11948.4274,
2726
- "eval_samples_per_second": 1.119,
2727
- "eval_steps_per_second": 0.14,
2728
- "step": 8000
2729
- },
2730
- {
2731
- "epoch": 2.66,
2732
- "learning_rate": 0.0,
2733
- "loss": 1.789,
2734
- "step": 8020
2735
- },
2736
- {
2737
- "epoch": 2.67,
2738
- "learning_rate": 0.0,
2739
- "loss": 1.7939,
2740
- "step": 8040
2741
- },
2742
- {
2743
- "epoch": 2.67,
2744
- "learning_rate": 0.0,
2745
- "loss": 1.7694,
2746
- "step": 8060
2747
- },
2748
- {
2749
- "epoch": 2.68,
2750
- "learning_rate": 0.0,
2751
- "loss": 1.7917,
2752
- "step": 8080
2753
- },
2754
- {
2755
- "epoch": 2.69,
2756
- "learning_rate": 0.0,
2757
- "loss": 1.7737,
2758
- "step": 8100
2759
- },
2760
- {
2761
- "epoch": 2.69,
2762
- "learning_rate": 0.0,
2763
- "loss": 1.8049,
2764
- "step": 8120
2765
- },
2766
- {
2767
- "epoch": 2.7,
2768
- "learning_rate": 0.0,
2769
- "loss": 1.8419,
2770
- "step": 8140
2771
- },
2772
- {
2773
- "epoch": 2.71,
2774
- "learning_rate": 0.0,
2775
- "loss": 1.7835,
2776
- "step": 8160
2777
- },
2778
- {
2779
- "epoch": 2.71,
2780
- "learning_rate": 0.0,
2781
- "loss": 1.7898,
2782
- "step": 8180
2783
- },
2784
- {
2785
- "epoch": 2.72,
2786
- "learning_rate": 0.0,
2787
- "loss": 1.7798,
2788
- "step": 8200
2789
- },
2790
- {
2791
- "epoch": 2.72,
2792
- "eval_loss": 1.970503568649292,
2793
- "eval_runtime": 11959.3563,
2794
- "eval_samples_per_second": 1.118,
2795
- "eval_steps_per_second": 0.14,
2796
- "step": 8200
2797
- },
2798
- {
2799
- "epoch": 2.73,
2800
- "learning_rate": 0.0,
2801
- "loss": 1.7904,
2802
- "step": 8220
2803
- },
2804
- {
2805
- "epoch": 2.73,
2806
- "learning_rate": 0.0,
2807
- "loss": 1.8054,
2808
- "step": 8240
2809
- },
2810
- {
2811
- "epoch": 2.74,
2812
- "learning_rate": 0.0,
2813
- "loss": 1.8316,
2814
- "step": 8260
2815
- },
2816
- {
2817
- "epoch": 2.75,
2818
- "learning_rate": 0.0,
2819
- "loss": 1.8059,
2820
- "step": 8280
2821
- },
2822
- {
2823
- "epoch": 2.75,
2824
- "learning_rate": 0.0,
2825
- "loss": 1.7908,
2826
- "step": 8300
2827
- },
2828
- {
2829
- "epoch": 2.76,
2830
- "learning_rate": 0.0,
2831
- "loss": 1.8092,
2832
- "step": 8320
2833
- },
2834
- {
2835
- "epoch": 2.77,
2836
- "learning_rate": 0.0,
2837
- "loss": 1.8954,
2838
- "step": 8340
2839
- },
2840
- {
2841
- "epoch": 2.77,
2842
- "learning_rate": 0.0,
2843
- "loss": 1.835,
2844
- "step": 8360
2845
- },
2846
- {
2847
- "epoch": 2.78,
2848
- "learning_rate": 0.0,
2849
- "loss": 1.8018,
2850
- "step": 8380
2851
- },
2852
- {
2853
- "epoch": 2.79,
2854
- "learning_rate": 0.0,
2855
- "loss": 1.8474,
2856
- "step": 8400
2857
- },
2858
- {
2859
- "epoch": 2.79,
2860
- "eval_loss": 1.970503568649292,
2861
- "eval_runtime": 11978.111,
2862
- "eval_samples_per_second": 1.116,
2863
- "eval_steps_per_second": 0.14,
2864
- "step": 8400
2865
- },
2866
- {
2867
- "epoch": 2.79,
2868
- "learning_rate": 0.0,
2869
- "loss": 1.7878,
2870
- "step": 8420
2871
- },
2872
- {
2873
- "epoch": 2.8,
2874
- "learning_rate": 0.0,
2875
- "loss": 1.7629,
2876
- "step": 8440
2877
- },
2878
- {
2879
- "epoch": 2.81,
2880
- "learning_rate": 0.0,
2881
- "loss": 1.8068,
2882
- "step": 8460
2883
- },
2884
- {
2885
- "epoch": 2.81,
2886
- "learning_rate": 0.0,
2887
- "loss": 1.7907,
2888
- "step": 8480
2889
- },
2890
- {
2891
- "epoch": 2.82,
2892
- "learning_rate": 0.0,
2893
- "loss": 1.7598,
2894
- "step": 8500
2895
- },
2896
- {
2897
- "epoch": 2.83,
2898
- "learning_rate": 0.0,
2899
- "loss": 1.7964,
2900
- "step": 8520
2901
- },
2902
- {
2903
- "epoch": 2.83,
2904
- "learning_rate": 0.0,
2905
- "loss": 1.7951,
2906
- "step": 8540
2907
- },
2908
- {
2909
- "epoch": 2.84,
2910
- "learning_rate": 0.0,
2911
- "loss": 1.7962,
2912
- "step": 8560
2913
- },
2914
- {
2915
- "epoch": 2.85,
2916
- "learning_rate": 0.0,
2917
- "loss": 1.7908,
2918
- "step": 8580
2919
- },
2920
- {
2921
- "epoch": 2.85,
2922
- "learning_rate": 0.0,
2923
- "loss": 1.7525,
2924
- "step": 8600
2925
- },
2926
- {
2927
- "epoch": 2.85,
2928
- "eval_loss": 1.970503568649292,
2929
- "eval_runtime": 11977.8941,
2930
- "eval_samples_per_second": 1.116,
2931
- "eval_steps_per_second": 0.14,
2932
- "step": 8600
2933
- },
2934
- {
2935
- "epoch": 2.86,
2936
- "learning_rate": 0.0,
2937
- "loss": 1.7268,
2938
- "step": 8620
2939
- },
2940
- {
2941
- "epoch": 2.87,
2942
- "learning_rate": 0.0,
2943
- "loss": 1.7716,
2944
- "step": 8640
2945
- },
2946
- {
2947
- "epoch": 2.87,
2948
- "learning_rate": 0.0,
2949
- "loss": 1.8214,
2950
- "step": 8660
2951
- },
2952
- {
2953
- "epoch": 2.88,
2954
- "learning_rate": 0.0,
2955
- "loss": 1.8116,
2956
- "step": 8680
2957
- },
2958
- {
2959
- "epoch": 2.89,
2960
- "learning_rate": 0.0,
2961
- "loss": 1.8204,
2962
- "step": 8700
2963
- },
2964
- {
2965
- "epoch": 2.89,
2966
- "learning_rate": 0.0,
2967
- "loss": 1.7878,
2968
- "step": 8720
2969
- },
2970
- {
2971
- "epoch": 2.9,
2972
- "learning_rate": 0.0,
2973
- "loss": 1.8828,
2974
- "step": 8740
2975
- },
2976
- {
2977
- "epoch": 2.91,
2978
- "learning_rate": 0.0,
2979
- "loss": 1.8015,
2980
- "step": 8760
2981
- },
2982
- {
2983
- "epoch": 2.91,
2984
- "learning_rate": 0.0,
2985
- "loss": 1.7989,
2986
- "step": 8780
2987
- },
2988
- {
2989
- "epoch": 2.92,
2990
- "learning_rate": 0.0,
2991
- "loss": 1.7467,
2992
- "step": 8800
2993
- },
2994
- {
2995
- "epoch": 2.92,
2996
- "eval_loss": 1.970503568649292,
2997
- "eval_runtime": 11960.8065,
2998
- "eval_samples_per_second": 1.118,
2999
- "eval_steps_per_second": 0.14,
3000
- "step": 8800
3001
- },
3002
- {
3003
- "epoch": 2.93,
3004
- "learning_rate": 0.0,
3005
- "loss": 1.7823,
3006
- "step": 8820
3007
- },
3008
- {
3009
- "epoch": 2.93,
3010
- "learning_rate": 0.0,
3011
- "loss": 1.8734,
3012
- "step": 8840
3013
- },
3014
- {
3015
- "epoch": 2.94,
3016
- "learning_rate": 0.0,
3017
- "loss": 1.8192,
3018
- "step": 8860
3019
- },
3020
- {
3021
- "epoch": 2.95,
3022
- "learning_rate": 0.0,
3023
- "loss": 1.8,
3024
- "step": 8880
3025
- },
3026
- {
3027
- "epoch": 2.95,
3028
- "learning_rate": 0.0,
3029
- "loss": 1.8057,
3030
- "step": 8900
3031
- },
3032
- {
3033
- "epoch": 2.96,
3034
- "learning_rate": 0.0,
3035
- "loss": 1.8007,
3036
- "step": 8920
3037
- },
3038
- {
3039
- "epoch": 2.97,
3040
- "learning_rate": 0.0,
3041
- "loss": 1.7826,
3042
- "step": 8940
3043
- },
3044
- {
3045
- "epoch": 2.97,
3046
- "learning_rate": 0.0,
3047
- "loss": 1.8612,
3048
- "step": 8960
3049
- },
3050
- {
3051
- "epoch": 2.98,
3052
- "learning_rate": 0.0,
3053
- "loss": 1.7843,
3054
- "step": 8980
3055
- },
3056
- {
3057
- "epoch": 2.99,
3058
- "learning_rate": 0.0,
3059
- "loss": 1.8619,
3060
- "step": 9000
3061
- },
3062
- {
3063
- "epoch": 2.99,
3064
- "eval_loss": 1.970503568649292,
3065
- "eval_runtime": 11953.5723,
3066
- "eval_samples_per_second": 1.118,
3067
- "eval_steps_per_second": 0.14,
3068
- "step": 9000
3069
- }
3070
- ],
3071
- "max_steps": 9045,
3072
- "num_train_epochs": 3,
3073
- "total_flos": 5.6126739980068454e+17,
3074
- "trial_name": null,
3075
- "trial_params": null
3076
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail-results_6000_samples/checkpoint-9000/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5a78612ea8930d68eba4cb53d62254ccf547582e754aa049d169c3c11dd5fe4
3
- size 4027
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail_6000_samples/adapter_config.json DELETED
@@ -1,16 +0,0 @@
1
- {
2
- "base_model_name_or_path": "/domino/edv/afs-mrmc-data-store-rw/innovation/hf/RedPajama-INCITE-7B-Base",
3
- "bias": "none",
4
- "fan_in_fan_out": false,
5
- "inference_mode": true,
6
- "init_lora_weights": true,
7
- "lora_alpha": 16,
8
- "lora_dropout": 0.05,
9
- "modules_to_save": null,
10
- "peft_type": "LORA",
11
- "r": 8,
12
- "target_modules": [
13
- "query_key_value"
14
- ],
15
- "task_type": "CAUSAL_LM"
16
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail_6000_samples/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7004b69a5e5338ee5dbc682271079d0cf6750a64f3dd06f0dabf0a4c8129f41b
3
- size 16800753
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail_6000_samples/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<eos>",
4
- "pad_token": "<|endoftext|>",
5
- "unk_token": "<|endoftext|>"
6
- }
 
 
 
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail_6000_samples/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
redpj7B-lora-cnn-dailymail_6000_samples/results/redpj7B-lora-cnn-dailymail_6000_samples/tokenizer_config.json DELETED
@@ -1,10 +0,0 @@
1
- {
2
- "add_eos_token": true,
3
- "add_prefix_space": false,
4
- "bos_token": "<|endoftext|>",
5
- "clean_up_tokenization_spaces": true,
6
- "eos_token": "<|endoftext|>",
7
- "model_max_length": 2048,
8
- "tokenizer_class": "GPTNeoXTokenizer",
9
- "unk_token": "<|endoftext|>"
10
- }
 
 
 
 
 
 
 
 
 
 
 
redpj7B-lora-cnn-dailymail_6000_samples/results/stdout.txt DELETED
The diff for this file is too large to render. See raw diff
 
redpj7B-lora-cnn-dailymail_6000_samples/script_fine_tuning.py DELETED
@@ -1,170 +0,0 @@
1
- afs_path = '/domino/edv/afs-mrmc-data-store-rw/innovation/hf/'
2
-
3
- import datasets
4
- from datasets import load_dataset
5
- import numpy as np
6
-
7
- from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType, PeftModel
8
-
9
- import transformers
10
- import torch
11
- print('transformers version: '+transformers.__version__)
12
- #print('tensorflow version: '+tf.__version__)
13
- print('torch version: '+torch.__version__)
14
-
15
-
16
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
17
-
18
-
19
- model = '7B' #'7B' # Pick your poison
20
-
21
- if model == '7B':
22
- model_name = ("RedPajama-INCITE-7B-Base","RedPajama-INCITE-7B-Base")
23
- run_name = 'redpj7B-lora-cnn-dailymail_fine_tune_test'
24
- dataset = 'cnn_dailymail'
25
- peft_name = './results/redpj7B-lora-cnn-dailymail_fine_tune_test'
26
- output_dir = './results/redpj7B-lora-cnn-dailymail-results_fine_tune_test'
27
- else: #3B
28
- model_name = ("RedPajama-INCITE-Base-3B-v1","RedPajama-INCITE-Base-3B-v1")
29
- run_name = 'redpj3B-lora-cnn-dailymail_fine_tune_test'
30
- dataset = 'cnn_dailymail'
31
- peft_name = './results/redpj3B-lora-cnn-dailymail_fine_tune_test'
32
- output_dir = './results/redpj3B-lora-cnn-dailymail-results_fine_tune_test'
33
-
34
- print(f"""model_name: {model_name[1]}, dataset: {dataset}, peft_name {peft_name}, run_name {run_name}, output_dir {output_dir}""")
35
-
36
-
37
- from transformers import AutoTokenizer
38
-
39
- print("Loading tokenizer for model: ", model_name[1])
40
- tokenizer = AutoTokenizer.from_pretrained(afs_path+model_name[1],add_eos_token=True)
41
- tokenizer.pad_token_id = 0
42
-
43
- tokenizer.add_special_tokens({'eos_token':'<eos>'})
44
- print('eos_token_id:',tokenizer.eos_token_id)
45
-
46
- #CUTOFF_LEN = 256 # 256 accounts for about 96% of the data in the alpaca dataset
47
- CUTOFF_LEN = 781 # 781 is the average token count for the articles according to https://huggingface.co/datasets/cnn_dailymail
48
-
49
-
50
- def tokenize(prompt, tokenizer,add_eos_token=True):
51
- result = tokenizer(
52
- prompt+"<eos>", # add the end-of-stream token
53
- truncation=True,
54
- max_length=CUTOFF_LEN,
55
- padding="max_length",
56
- )
57
- return {
58
- "input_ids": result["input_ids"],
59
- "attention_mask": result["attention_mask"],
60
- }
61
-
62
-
63
-
64
- data = datasets.load_from_disk('cnn_dailymail_dataset')
65
-
66
- num_train_examples = len(data['train'])
67
-
68
- # Define the percentage of data you want to keep
69
- percentage_to_keep = 0.02 # Adjust this value to your desired percentage (0.02 is about 6k samples)
70
-
71
- # Calculate the number of examples to keep
72
- num_examples_to_keep = int(num_train_examples * percentage_to_keep)
73
-
74
- # Reduce the 'train' split to the desired amount
75
- train_data_reduced = data['train'].select(range(num_examples_to_keep))
76
-
77
- #train_data_reduced.save_to_disk("./cnn_dailymail_dataset/train_data_reduced")
78
-
79
- def generate_prompt(data_point):
80
- # sorry about the formatting disaster gotta move fast
81
- if data_point["article"]:
82
- return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
83
-
84
- ### Instruction:
85
- Summarize the text from the input.
86
-
87
- ### Input:
88
- {data_point["article"]}
89
-
90
- ### Response:
91
- {data_point["highlights"]}"""
92
- else:
93
- return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
94
-
95
- ### Instruction:
96
- Summarize the text.
97
-
98
- ### Response:
99
- {data_point["highlights"]}"""
100
-
101
- train_data = data["train"]
102
- val_data = data["validation"]
103
-
104
- #train_data = train_data.map(lambda x: tokenize(generate_prompt(x), tokenizer))
105
- train_data = train_data_reduced.map(lambda x: tokenize(generate_prompt(x), tokenizer)) # use reduced train set
106
- val_data = val_data.map(lambda x: tokenize(generate_prompt(x), tokenizer))
107
-
108
- from transformers import AutoModelForCausalLM
109
-
110
- print("Loading model for model: ", model_name[0])
111
-
112
- model = AutoModelForCausalLM.from_pretrained(
113
- afs_path+model_name[0],
114
- load_in_8bit=False, # changed from True to False
115
- device_map="auto",
116
- )
117
-
118
-
119
-
120
- # Define LoRA Config
121
- lora_config = LoraConfig(
122
- r= 8,
123
- lora_alpha=16,
124
- target_modules=["query_key_value"],
125
- lora_dropout=0.05,
126
- bias="none",
127
- task_type=TaskType.CAUSAL_LM
128
- )
129
-
130
-
131
- # prepare int-8 model for training
132
- #model = prepare_model_for_int8_training(model) #uncomment for int8
133
-
134
- # add LoRA adaptor
135
- model = get_peft_model(model, lora_config)
136
-
137
- eval_steps = 200
138
- save_steps = 200
139
- logging_steps = 20
140
-
141
- trainer = transformers.Trainer(
142
- model=model,
143
- train_dataset=train_data,
144
- eval_dataset=val_data,
145
- args=transformers.TrainingArguments(
146
- num_train_epochs=3,
147
- learning_rate=3e-4,
148
- logging_steps=logging_steps,
149
- logging_dir='./results', # directory for storing logs
150
- evaluation_strategy="steps",
151
- save_strategy="steps",
152
- eval_steps=eval_steps,
153
- save_steps=save_steps,
154
- output_dir=output_dir,
155
- report_to="none", #changed from report_to if report_to else to "none"
156
- save_total_limit=3,
157
- load_best_model_at_end=True,
158
- push_to_hub=False,
159
- auto_find_batch_size=True
160
- ),
161
- data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
162
- )
163
-
164
- model.config.use_cache = False # silence the warnings. Please re-enable for inference!
165
-
166
- trainer.train()
167
-
168
- # Save our LoRA model & tokenizer results
169
- trainer.model.save_pretrained(peft_name)
170
- tokenizer.save_pretrained(peft_name)