Santipab committed (verified)
Commit 4595e44
1 Parent(s): 40ea74a

Upload 8 files

config.json ADDED
@@ -0,0 +1,53 @@
+ {
+ "_name_or_path": "microsoft/resnet-50",
+ "architectures": [
+ "ResNetForImageClassification"
+ ],
+ "depths": [
+ 3,
+ 4,
+ 6,
+ 3
+ ],
+ "downsample_in_bottleneck": false,
+ "downsample_in_first_stage": false,
+ "embedding_size": 64,
+ "hidden_act": "relu",
+ "hidden_sizes": [
+ 256,
+ 512,
+ 1024,
+ 2048
+ ],
+ "id2label": {
+ "0": "COVID",
+ "1": "Lung opacity",
+ "2": "Normal",
+ "3": "Viral Pneumonia"
+ },
+ "label2id": {
+ "COVID": 0,
+ "Lung opacity": 1,
+ "Normal": 2,
+ "Viral Pneumonia": 3
+ },
+ "layer_type": "bottleneck",
+ "model_type": "resnet",
+ "num_channels": 3,
+ "out_features": [
+ "stage4"
+ ],
+ "out_indices": [
+ 4
+ ],
+ "problem_type": "single_label_classification",
+ "stage_names": [
+ "stem",
+ "stage1",
+ "stage2",
+ "stage3",
+ "stage4"
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.42.4"
+ }
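
The config above describes a ResNet-50 fine-tuned for 4-way chest X-ray classification (COVID, Lung opacity, Normal, Viral Pneumonia). A minimal inference sketch, assuming the repository is cloned to a hypothetical local directory `./resnet50-covid-xray` and that a sample image `chest_xray.png` exists; neither name comes from the repository itself:

```python
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification

# Hypothetical local path to a clone of this repository (adjust as needed).
model_dir = "./resnet50-covid-xray"

processor = AutoImageProcessor.from_pretrained(model_dir)
model = AutoModelForImageClassification.from_pretrained(model_dir)
model.eval()

# Hypothetical input file; any RGB chest X-ray image is handled the same way.
image = Image.open("chest_xray.png").convert("RGB")
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted = logits.argmax(-1).item()
# id2label maps the predicted index back to COVID / Lung opacity / Normal / Viral Pneumonia
print(model.config.id2label[predicted])
```
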
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a519fbb0a3e371645e7b2de81a25851fd1a5431d5b8f8b8b8a3500cf7009c5a8
+ size 94319344
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59c08134eddb0047e92fc278b4d07329a2990e13ce2b942eff1fc081f4a2f59c
+ size 188265274
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "crop_pct": 0.875,
+ "do_normalize": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 0.485,
+ 0.456,
+ 0.406
+ ],
+ "image_processor_type": "ConvNextImageProcessor",
+ "image_std": [
+ 0.229,
+ 0.224,
+ 0.225
+ ],
+ "resample": 3,
+ "rescale_factor": 0.00392156862745098,
+ "size": {
+ "shortest_edge": 224
+ }
+ }
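
For reference, the ConvNeXt-style preprocessor above (crop_pct 0.875, shortest edge 224, bicubic resampling, ImageNet mean/std, rescale by 1/255) corresponds roughly to the following torchvision pipeline. This is an illustrative sketch, not code from the repository:

```python
from torchvision import transforms

# Resize the shortest edge to 224 / 0.875 = 256, center-crop to 224x224,
# scale pixel values to [0, 1], then normalize with the ImageNet statistics
# listed in preprocessor_config.json. resample=3 corresponds to bicubic.
eval_transform = transforms.Compose([
    transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),  # rescale_factor = 1/255
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
```
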
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37f947d6b54eeedc56f985f58c76b5403af723450de37052dd484bc469768e1b
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d93b3b4529555907ae84565fa978f9e641a53e6e4a74c902aca536002ae407dc
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,2067 @@
+ {
+ "best_metric": 0.9375,
+ "best_model_checkpoint": "./resnet50/checkpoint-2436",
+ "epoch": 30.0,
+ "eval_steps": 500,
+ "global_step": 2520,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
11
+ {
12
+ "epoch": 0.11904761904761904,
13
+ "grad_norm": 0.48410382866859436,
14
+ "learning_rate": 7.936507936507937e-07,
15
+ "loss": 1.3866,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.23809523809523808,
20
+ "grad_norm": 0.4765987694263458,
21
+ "learning_rate": 1.5873015873015873e-06,
22
+ "loss": 1.3861,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.35714285714285715,
27
+ "grad_norm": 0.43654772639274597,
28
+ "learning_rate": 2.380952380952381e-06,
29
+ "loss": 1.3859,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.47619047619047616,
34
+ "grad_norm": 0.5733897686004639,
35
+ "learning_rate": 3.1746031746031746e-06,
36
+ "loss": 1.3855,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.5952380952380952,
41
+ "grad_norm": 0.457210898399353,
42
+ "learning_rate": 3.968253968253968e-06,
43
+ "loss": 1.385,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.7142857142857143,
48
+ "grad_norm": 0.5107349157333374,
49
+ "learning_rate": 4.761904761904762e-06,
50
+ "loss": 1.3843,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.8333333333333334,
55
+ "grad_norm": 0.6200265288352966,
56
+ "learning_rate": 5.555555555555557e-06,
57
+ "loss": 1.3832,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.9523809523809523,
62
+ "grad_norm": 0.6969241499900818,
63
+ "learning_rate": 6.349206349206349e-06,
64
+ "loss": 1.382,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 1.0,
69
+ "eval_accuracy": 0.58375,
70
+ "eval_loss": 1.3806676864624023,
71
+ "eval_runtime": 5.5764,
72
+ "eval_samples_per_second": 143.462,
73
+ "eval_steps_per_second": 17.933,
74
+ "step": 84
75
+ },
76
+ {
77
+ "epoch": 1.0714285714285714,
78
+ "grad_norm": 0.616682231426239,
79
+ "learning_rate": 7.1428571428571436e-06,
80
+ "loss": 1.3805,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 1.1904761904761905,
85
+ "grad_norm": 0.774887204170227,
86
+ "learning_rate": 7.936507936507936e-06,
87
+ "loss": 1.3786,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 1.3095238095238095,
92
+ "grad_norm": 0.862555205821991,
93
+ "learning_rate": 8.730158730158731e-06,
94
+ "loss": 1.3754,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 1.4285714285714286,
99
+ "grad_norm": 0.7880147695541382,
100
+ "learning_rate": 9.523809523809525e-06,
101
+ "loss": 1.3714,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 1.5476190476190477,
106
+ "grad_norm": 0.9495492577552795,
107
+ "learning_rate": 1.031746031746032e-05,
108
+ "loss": 1.3663,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 1.6666666666666665,
113
+ "grad_norm": 1.0720769166946411,
114
+ "learning_rate": 1.1111111111111113e-05,
115
+ "loss": 1.3595,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 1.7857142857142856,
120
+ "grad_norm": 1.4009815454483032,
121
+ "learning_rate": 1.1904761904761905e-05,
122
+ "loss": 1.3506,
123
+ "step": 150
124
+ },
125
+ {
126
+ "epoch": 1.9047619047619047,
127
+ "grad_norm": 1.4374886751174927,
128
+ "learning_rate": 1.2698412698412699e-05,
129
+ "loss": 1.3401,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 2.0,
134
+ "eval_accuracy": 0.44875,
135
+ "eval_loss": 1.31219482421875,
136
+ "eval_runtime": 5.4479,
137
+ "eval_samples_per_second": 146.845,
138
+ "eval_steps_per_second": 18.356,
139
+ "step": 168
140
+ },
141
+ {
142
+ "epoch": 2.0238095238095237,
143
+ "grad_norm": 1.4388487339019775,
144
+ "learning_rate": 1.3492063492063494e-05,
145
+ "loss": 1.3256,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 2.142857142857143,
150
+ "grad_norm": 2.0602617263793945,
151
+ "learning_rate": 1.4285714285714287e-05,
152
+ "loss": 1.3085,
153
+ "step": 180
154
+ },
155
+ {
156
+ "epoch": 2.261904761904762,
157
+ "grad_norm": 2.5268349647521973,
158
+ "learning_rate": 1.507936507936508e-05,
159
+ "loss": 1.2787,
160
+ "step": 190
161
+ },
162
+ {
163
+ "epoch": 2.380952380952381,
164
+ "grad_norm": 2.7516028881073,
165
+ "learning_rate": 1.5873015873015872e-05,
166
+ "loss": 1.2603,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 2.5,
171
+ "grad_norm": 2.6878774166107178,
172
+ "learning_rate": 1.6666666666666667e-05,
173
+ "loss": 1.2355,
174
+ "step": 210
175
+ },
176
+ {
177
+ "epoch": 2.619047619047619,
178
+ "grad_norm": 1.6872291564941406,
179
+ "learning_rate": 1.7460317460317463e-05,
180
+ "loss": 1.201,
181
+ "step": 220
182
+ },
183
+ {
184
+ "epoch": 2.738095238095238,
185
+ "grad_norm": 1.6558163166046143,
186
+ "learning_rate": 1.8253968253968254e-05,
187
+ "loss": 1.18,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 2.857142857142857,
192
+ "grad_norm": 2.9151134490966797,
193
+ "learning_rate": 1.904761904761905e-05,
194
+ "loss": 1.1485,
195
+ "step": 240
196
+ },
197
+ {
198
+ "epoch": 2.9761904761904763,
199
+ "grad_norm": 1.5142790079116821,
200
+ "learning_rate": 1.9841269841269845e-05,
201
+ "loss": 1.1534,
202
+ "step": 250
203
+ },
204
+ {
205
+ "epoch": 3.0,
206
+ "eval_accuracy": 0.5,
207
+ "eval_loss": 1.121809482574463,
208
+ "eval_runtime": 5.3391,
209
+ "eval_samples_per_second": 149.838,
210
+ "eval_steps_per_second": 18.73,
211
+ "step": 252
212
+ },
213
+ {
214
+ "epoch": 3.0952380952380953,
215
+ "grad_norm": 1.396998643875122,
216
+ "learning_rate": 1.9938271604938272e-05,
217
+ "loss": 1.1074,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 3.2142857142857144,
222
+ "grad_norm": 1.8247098922729492,
223
+ "learning_rate": 1.9850088183421517e-05,
224
+ "loss": 1.1146,
225
+ "step": 270
226
+ },
227
+ {
228
+ "epoch": 3.3333333333333335,
229
+ "grad_norm": 3.359285831451416,
230
+ "learning_rate": 1.9761904761904763e-05,
231
+ "loss": 1.0917,
232
+ "step": 280
233
+ },
234
+ {
235
+ "epoch": 3.4523809523809526,
236
+ "grad_norm": 2.1544318199157715,
237
+ "learning_rate": 1.9673721340388008e-05,
238
+ "loss": 1.0922,
239
+ "step": 290
240
+ },
241
+ {
242
+ "epoch": 3.571428571428571,
243
+ "grad_norm": 1.6087664365768433,
244
+ "learning_rate": 1.9585537918871253e-05,
245
+ "loss": 1.076,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 3.6904761904761907,
250
+ "grad_norm": 2.3240392208099365,
251
+ "learning_rate": 1.94973544973545e-05,
252
+ "loss": 1.0453,
253
+ "step": 310
254
+ },
255
+ {
256
+ "epoch": 3.8095238095238093,
257
+ "grad_norm": 2.1408257484436035,
258
+ "learning_rate": 1.9409171075837744e-05,
259
+ "loss": 1.0366,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 3.928571428571429,
264
+ "grad_norm": 4.0489583015441895,
265
+ "learning_rate": 1.932098765432099e-05,
266
+ "loss": 1.0343,
267
+ "step": 330
268
+ },
269
+ {
270
+ "epoch": 4.0,
271
+ "eval_accuracy": 0.67625,
272
+ "eval_loss": 1.0234503746032715,
273
+ "eval_runtime": 5.3529,
274
+ "eval_samples_per_second": 149.453,
275
+ "eval_steps_per_second": 18.682,
276
+ "step": 336
277
+ },
278
+ {
279
+ "epoch": 4.0476190476190474,
280
+ "grad_norm": 3.3416860103607178,
281
+ "learning_rate": 1.9232804232804235e-05,
282
+ "loss": 1.0142,
283
+ "step": 340
284
+ },
285
+ {
286
+ "epoch": 4.166666666666667,
287
+ "grad_norm": 2.0407114028930664,
288
+ "learning_rate": 1.914462081128748e-05,
289
+ "loss": 0.9883,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 4.285714285714286,
294
+ "grad_norm": 1.7006970643997192,
295
+ "learning_rate": 1.9056437389770726e-05,
296
+ "loss": 0.9741,
297
+ "step": 360
298
+ },
299
+ {
300
+ "epoch": 4.404761904761905,
301
+ "grad_norm": 2.021171808242798,
302
+ "learning_rate": 1.8968253968253968e-05,
303
+ "loss": 0.9754,
304
+ "step": 370
305
+ },
306
+ {
307
+ "epoch": 4.523809523809524,
308
+ "grad_norm": 4.418124675750732,
309
+ "learning_rate": 1.8880070546737216e-05,
310
+ "loss": 0.9428,
311
+ "step": 380
312
+ },
313
+ {
314
+ "epoch": 4.642857142857143,
315
+ "grad_norm": 2.6121156215667725,
316
+ "learning_rate": 1.8791887125220462e-05,
317
+ "loss": 0.9401,
318
+ "step": 390
319
+ },
320
+ {
321
+ "epoch": 4.761904761904762,
322
+ "grad_norm": 2.1322250366210938,
323
+ "learning_rate": 1.8703703703703707e-05,
324
+ "loss": 0.9409,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 4.880952380952381,
329
+ "grad_norm": 1.571338415145874,
330
+ "learning_rate": 1.861552028218695e-05,
331
+ "loss": 0.9062,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 5.0,
336
+ "grad_norm": Infinity,
337
+ "learning_rate": 1.853615520282187e-05,
338
+ "loss": 0.9117,
339
+ "step": 420
340
+ },
341
+ {
342
+ "epoch": 5.0,
343
+ "eval_accuracy": 0.7175,
344
+ "eval_loss": 0.9020006656646729,
345
+ "eval_runtime": 5.4379,
346
+ "eval_samples_per_second": 147.116,
347
+ "eval_steps_per_second": 18.389,
348
+ "step": 420
349
+ },
350
+ {
351
+ "epoch": 5.119047619047619,
352
+ "grad_norm": 4.1433868408203125,
353
+ "learning_rate": 1.8447971781305116e-05,
354
+ "loss": 0.8893,
355
+ "step": 430
356
+ },
357
+ {
358
+ "epoch": 5.238095238095238,
359
+ "grad_norm": 2.4193081855773926,
360
+ "learning_rate": 1.835978835978836e-05,
361
+ "loss": 0.8883,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 5.357142857142857,
366
+ "grad_norm": 3.0261456966400146,
367
+ "learning_rate": 1.8271604938271607e-05,
368
+ "loss": 0.8375,
369
+ "step": 450
370
+ },
371
+ {
372
+ "epoch": 5.476190476190476,
373
+ "grad_norm": 4.649360656738281,
374
+ "learning_rate": 1.8183421516754852e-05,
375
+ "loss": 0.8617,
376
+ "step": 460
377
+ },
378
+ {
379
+ "epoch": 5.595238095238095,
380
+ "grad_norm": 2.217705011367798,
381
+ "learning_rate": 1.8095238095238097e-05,
382
+ "loss": 0.8377,
383
+ "step": 470
384
+ },
385
+ {
386
+ "epoch": 5.714285714285714,
387
+ "grad_norm": 3.888078212738037,
388
+ "learning_rate": 1.8007054673721343e-05,
389
+ "loss": 0.8027,
390
+ "step": 480
391
+ },
392
+ {
393
+ "epoch": 5.833333333333333,
394
+ "grad_norm": 3.240481376647949,
395
+ "learning_rate": 1.7918871252204585e-05,
396
+ "loss": 0.8258,
397
+ "step": 490
398
+ },
399
+ {
400
+ "epoch": 5.9523809523809526,
401
+ "grad_norm": 2.535261631011963,
402
+ "learning_rate": 1.783068783068783e-05,
403
+ "loss": 0.8169,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 6.0,
408
+ "eval_accuracy": 0.7925,
409
+ "eval_loss": 0.8129910230636597,
410
+ "eval_runtime": 5.2312,
411
+ "eval_samples_per_second": 152.927,
412
+ "eval_steps_per_second": 19.116,
413
+ "step": 504
414
+ },
415
+ {
416
+ "epoch": 6.071428571428571,
417
+ "grad_norm": 3.822288751602173,
418
+ "learning_rate": 1.774250440917108e-05,
419
+ "loss": 0.7932,
420
+ "step": 510
421
+ },
422
+ {
423
+ "epoch": 6.190476190476191,
424
+ "grad_norm": 2.448084592819214,
425
+ "learning_rate": 1.7654320987654324e-05,
426
+ "loss": 0.7807,
427
+ "step": 520
428
+ },
429
+ {
430
+ "epoch": 6.309523809523809,
431
+ "grad_norm": 4.276307106018066,
432
+ "learning_rate": 1.7566137566137566e-05,
433
+ "loss": 0.7736,
434
+ "step": 530
435
+ },
436
+ {
437
+ "epoch": 6.428571428571429,
438
+ "grad_norm": 3.5429012775421143,
439
+ "learning_rate": 1.747795414462081e-05,
440
+ "loss": 0.7453,
441
+ "step": 540
442
+ },
443
+ {
444
+ "epoch": 6.5476190476190474,
445
+ "grad_norm": 2.4782090187072754,
446
+ "learning_rate": 1.7389770723104057e-05,
447
+ "loss": 0.7327,
448
+ "step": 550
449
+ },
450
+ {
451
+ "epoch": 6.666666666666667,
452
+ "grad_norm": 5.342073440551758,
453
+ "learning_rate": 1.7301587301587302e-05,
454
+ "loss": 0.7247,
455
+ "step": 560
456
+ },
457
+ {
458
+ "epoch": 6.785714285714286,
459
+ "grad_norm": 10.06567668914795,
460
+ "learning_rate": 1.7213403880070548e-05,
461
+ "loss": 0.7088,
462
+ "step": 570
463
+ },
464
+ {
465
+ "epoch": 6.904761904761905,
466
+ "grad_norm": 3.217452049255371,
467
+ "learning_rate": 1.713403880070547e-05,
468
+ "loss": 0.7058,
469
+ "step": 580
470
+ },
471
+ {
472
+ "epoch": 7.0,
473
+ "eval_accuracy": 0.815,
474
+ "eval_loss": 0.7232482433319092,
475
+ "eval_runtime": 5.4248,
476
+ "eval_samples_per_second": 147.471,
477
+ "eval_steps_per_second": 18.434,
478
+ "step": 588
479
+ },
480
+ {
481
+ "epoch": 7.023809523809524,
482
+ "grad_norm": 4.111032009124756,
483
+ "learning_rate": 1.7045855379188714e-05,
484
+ "loss": 0.6987,
485
+ "step": 590
486
+ },
487
+ {
488
+ "epoch": 7.142857142857143,
489
+ "grad_norm": 2.667541027069092,
490
+ "learning_rate": 1.695767195767196e-05,
491
+ "loss": 0.684,
492
+ "step": 600
493
+ },
494
+ {
495
+ "epoch": 7.261904761904762,
496
+ "grad_norm": 8.819822311401367,
497
+ "learning_rate": 1.6869488536155205e-05,
498
+ "loss": 0.674,
499
+ "step": 610
500
+ },
501
+ {
502
+ "epoch": 7.380952380952381,
503
+ "grad_norm": 3.9701991081237793,
504
+ "learning_rate": 1.6781305114638447e-05,
505
+ "loss": 0.6199,
506
+ "step": 620
507
+ },
508
+ {
509
+ "epoch": 7.5,
510
+ "grad_norm": 8.835700035095215,
511
+ "learning_rate": 1.6693121693121696e-05,
512
+ "loss": 0.6241,
513
+ "step": 630
514
+ },
515
+ {
516
+ "epoch": 7.619047619047619,
517
+ "grad_norm": 3.7322373390197754,
518
+ "learning_rate": 1.660493827160494e-05,
519
+ "loss": 0.6087,
520
+ "step": 640
521
+ },
522
+ {
523
+ "epoch": 7.738095238095238,
524
+ "grad_norm": 4.442748069763184,
525
+ "learning_rate": 1.6516754850088187e-05,
526
+ "loss": 0.589,
527
+ "step": 650
528
+ },
529
+ {
530
+ "epoch": 7.857142857142857,
531
+ "grad_norm": 3.6890928745269775,
532
+ "learning_rate": 1.642857142857143e-05,
533
+ "loss": 0.5804,
534
+ "step": 660
535
+ },
536
+ {
537
+ "epoch": 7.976190476190476,
538
+ "grad_norm": 7.095566749572754,
539
+ "learning_rate": 1.6340388007054674e-05,
540
+ "loss": 0.5556,
541
+ "step": 670
542
+ },
543
+ {
544
+ "epoch": 8.0,
545
+ "eval_accuracy": 0.83375,
546
+ "eval_loss": 0.58147794008255,
547
+ "eval_runtime": 6.0471,
548
+ "eval_samples_per_second": 132.295,
549
+ "eval_steps_per_second": 16.537,
550
+ "step": 672
551
+ },
552
+ {
553
+ "epoch": 8.095238095238095,
554
+ "grad_norm": 4.2159318923950195,
555
+ "learning_rate": 1.625220458553792e-05,
556
+ "loss": 0.577,
557
+ "step": 680
558
+ },
559
+ {
560
+ "epoch": 8.214285714285714,
561
+ "grad_norm": 3.634282350540161,
562
+ "learning_rate": 1.6164021164021168e-05,
563
+ "loss": 0.5226,
564
+ "step": 690
565
+ },
566
+ {
567
+ "epoch": 8.333333333333334,
568
+ "grad_norm": 3.7204463481903076,
569
+ "learning_rate": 1.607583774250441e-05,
570
+ "loss": 0.5397,
571
+ "step": 700
572
+ },
573
+ {
574
+ "epoch": 8.452380952380953,
575
+ "grad_norm": 8.489058494567871,
576
+ "learning_rate": 1.5987654320987655e-05,
577
+ "loss": 0.515,
578
+ "step": 710
579
+ },
580
+ {
581
+ "epoch": 8.571428571428571,
582
+ "grad_norm": 8.01997184753418,
583
+ "learning_rate": 1.58994708994709e-05,
584
+ "loss": 0.5017,
585
+ "step": 720
586
+ },
587
+ {
588
+ "epoch": 8.69047619047619,
589
+ "grad_norm": 5.835849761962891,
590
+ "learning_rate": 1.5811287477954146e-05,
591
+ "loss": 0.5008,
592
+ "step": 730
593
+ },
594
+ {
595
+ "epoch": 8.80952380952381,
596
+ "grad_norm": 10.542766571044922,
597
+ "learning_rate": 1.572310405643739e-05,
598
+ "loss": 0.476,
599
+ "step": 740
600
+ },
601
+ {
602
+ "epoch": 8.928571428571429,
603
+ "grad_norm": 7.684937953948975,
604
+ "learning_rate": 1.5634920634920637e-05,
605
+ "loss": 0.4527,
606
+ "step": 750
607
+ },
608
+ {
609
+ "epoch": 9.0,
610
+ "eval_accuracy": 0.86625,
611
+ "eval_loss": 0.4813559651374817,
612
+ "eval_runtime": 6.5617,
613
+ "eval_samples_per_second": 121.92,
614
+ "eval_steps_per_second": 15.24,
615
+ "step": 756
616
+ },
617
+ {
618
+ "epoch": 9.047619047619047,
619
+ "grad_norm": 6.5577569007873535,
620
+ "learning_rate": 1.5546737213403882e-05,
621
+ "loss": 0.4875,
622
+ "step": 760
623
+ },
624
+ {
625
+ "epoch": 9.166666666666666,
626
+ "grad_norm": 6.667921543121338,
627
+ "learning_rate": 1.5458553791887128e-05,
628
+ "loss": 0.4581,
629
+ "step": 770
630
+ },
631
+ {
632
+ "epoch": 9.285714285714286,
633
+ "grad_norm": 5.33371114730835,
634
+ "learning_rate": 1.537037037037037e-05,
635
+ "loss": 0.4269,
636
+ "step": 780
637
+ },
638
+ {
639
+ "epoch": 9.404761904761905,
640
+ "grad_norm": 4.12121057510376,
641
+ "learning_rate": 1.5282186948853618e-05,
642
+ "loss": 0.4173,
643
+ "step": 790
644
+ },
645
+ {
646
+ "epoch": 9.523809523809524,
647
+ "grad_norm": 7.042411804199219,
648
+ "learning_rate": 1.5194003527336862e-05,
649
+ "loss": 0.4312,
650
+ "step": 800
651
+ },
652
+ {
653
+ "epoch": 9.642857142857142,
654
+ "grad_norm": 6.229002952575684,
655
+ "learning_rate": 1.5105820105820109e-05,
656
+ "loss": 0.421,
657
+ "step": 810
658
+ },
659
+ {
660
+ "epoch": 9.761904761904763,
661
+ "grad_norm": 9.01046085357666,
662
+ "learning_rate": 1.5017636684303351e-05,
663
+ "loss": 0.4182,
664
+ "step": 820
665
+ },
666
+ {
667
+ "epoch": 9.880952380952381,
668
+ "grad_norm": 9.333113670349121,
669
+ "learning_rate": 1.4929453262786598e-05,
670
+ "loss": 0.4065,
671
+ "step": 830
672
+ },
673
+ {
674
+ "epoch": 10.0,
675
+ "grad_norm": 5.7903876304626465,
676
+ "learning_rate": 1.4841269841269843e-05,
677
+ "loss": 0.3994,
678
+ "step": 840
679
+ },
680
+ {
681
+ "epoch": 10.0,
682
+ "eval_accuracy": 0.86875,
683
+ "eval_loss": 0.43535658717155457,
684
+ "eval_runtime": 6.424,
685
+ "eval_samples_per_second": 124.534,
686
+ "eval_steps_per_second": 15.567,
687
+ "step": 840
688
+ },
689
+ {
690
+ "epoch": 10.119047619047619,
691
+ "grad_norm": 4.672136306762695,
692
+ "learning_rate": 1.4753086419753087e-05,
693
+ "loss": 0.3692,
694
+ "step": 850
695
+ },
696
+ {
697
+ "epoch": 10.238095238095237,
698
+ "grad_norm": 9.825118064880371,
699
+ "learning_rate": 1.4664902998236332e-05,
700
+ "loss": 0.3946,
701
+ "step": 860
702
+ },
703
+ {
704
+ "epoch": 10.357142857142858,
705
+ "grad_norm": 10.594053268432617,
706
+ "learning_rate": 1.4576719576719578e-05,
707
+ "loss": 0.3836,
708
+ "step": 870
709
+ },
710
+ {
711
+ "epoch": 10.476190476190476,
712
+ "grad_norm": 6.456384658813477,
713
+ "learning_rate": 1.4488536155202823e-05,
714
+ "loss": 0.3922,
715
+ "step": 880
716
+ },
717
+ {
718
+ "epoch": 10.595238095238095,
719
+ "grad_norm": 7.883987903594971,
720
+ "learning_rate": 1.4400352733686067e-05,
721
+ "loss": 0.3767,
722
+ "step": 890
723
+ },
724
+ {
725
+ "epoch": 10.714285714285714,
726
+ "grad_norm": 3.752121686935425,
727
+ "learning_rate": 1.4312169312169312e-05,
728
+ "loss": 0.3731,
729
+ "step": 900
730
+ },
731
+ {
732
+ "epoch": 10.833333333333334,
733
+ "grad_norm": 5.878788948059082,
734
+ "learning_rate": 1.422398589065256e-05,
735
+ "loss": 0.3581,
736
+ "step": 910
737
+ },
738
+ {
739
+ "epoch": 10.952380952380953,
740
+ "grad_norm": 10.106928825378418,
741
+ "learning_rate": 1.4135802469135805e-05,
742
+ "loss": 0.3399,
743
+ "step": 920
744
+ },
745
+ {
746
+ "epoch": 11.0,
747
+ "eval_accuracy": 0.89,
748
+ "eval_loss": 0.3747052252292633,
749
+ "eval_runtime": 6.1766,
750
+ "eval_samples_per_second": 129.52,
751
+ "eval_steps_per_second": 16.19,
752
+ "step": 924
753
+ },
754
+ {
755
+ "epoch": 11.071428571428571,
756
+ "grad_norm": 13.221107482910156,
757
+ "learning_rate": 1.4047619047619048e-05,
758
+ "loss": 0.3512,
759
+ "step": 930
760
+ },
761
+ {
762
+ "epoch": 11.19047619047619,
763
+ "grad_norm": 5.013436317443848,
764
+ "learning_rate": 1.3959435626102294e-05,
765
+ "loss": 0.3217,
766
+ "step": 940
767
+ },
768
+ {
769
+ "epoch": 11.30952380952381,
770
+ "grad_norm": 4.782650470733643,
771
+ "learning_rate": 1.3871252204585539e-05,
772
+ "loss": 0.3603,
773
+ "step": 950
774
+ },
775
+ {
776
+ "epoch": 11.428571428571429,
777
+ "grad_norm": 7.278000354766846,
778
+ "learning_rate": 1.3783068783068784e-05,
779
+ "loss": 0.3193,
780
+ "step": 960
781
+ },
782
+ {
783
+ "epoch": 11.547619047619047,
784
+ "grad_norm": 13.20319652557373,
785
+ "learning_rate": 1.3694885361552028e-05,
786
+ "loss": 0.3429,
787
+ "step": 970
788
+ },
789
+ {
790
+ "epoch": 11.666666666666666,
791
+ "grad_norm": 10.208937644958496,
792
+ "learning_rate": 1.3606701940035273e-05,
793
+ "loss": 0.3391,
794
+ "step": 980
795
+ },
796
+ {
797
+ "epoch": 11.785714285714286,
798
+ "grad_norm": 8.55504322052002,
799
+ "learning_rate": 1.351851851851852e-05,
800
+ "loss": 0.3286,
801
+ "step": 990
802
+ },
803
+ {
804
+ "epoch": 11.904761904761905,
805
+ "grad_norm": 9.174301147460938,
806
+ "learning_rate": 1.3430335097001766e-05,
807
+ "loss": 0.3157,
808
+ "step": 1000
809
+ },
810
+ {
811
+ "epoch": 12.0,
812
+ "eval_accuracy": 0.895,
813
+ "eval_loss": 0.33855053782463074,
814
+ "eval_runtime": 5.5685,
815
+ "eval_samples_per_second": 143.664,
816
+ "eval_steps_per_second": 17.958,
817
+ "step": 1008
818
+ },
819
+ {
820
+ "epoch": 12.023809523809524,
821
+ "grad_norm": 9.196775436401367,
822
+ "learning_rate": 1.334215167548501e-05,
823
+ "loss": 0.32,
824
+ "step": 1010
825
+ },
826
+ {
827
+ "epoch": 12.142857142857142,
828
+ "grad_norm": 7.491143703460693,
829
+ "learning_rate": 1.3253968253968255e-05,
830
+ "loss": 0.3211,
831
+ "step": 1020
832
+ },
833
+ {
834
+ "epoch": 12.261904761904763,
835
+ "grad_norm": 10.411942481994629,
836
+ "learning_rate": 1.31657848324515e-05,
837
+ "loss": 0.31,
838
+ "step": 1030
839
+ },
840
+ {
841
+ "epoch": 12.380952380952381,
842
+ "grad_norm": 7.7979021072387695,
843
+ "learning_rate": 1.3077601410934746e-05,
844
+ "loss": 0.301,
845
+ "step": 1040
846
+ },
847
+ {
848
+ "epoch": 12.5,
849
+ "grad_norm": 16.540781021118164,
850
+ "learning_rate": 1.298941798941799e-05,
851
+ "loss": 0.2879,
852
+ "step": 1050
853
+ },
854
+ {
855
+ "epoch": 12.619047619047619,
856
+ "grad_norm": 8.34303092956543,
857
+ "learning_rate": 1.2901234567901235e-05,
858
+ "loss": 0.2837,
859
+ "step": 1060
860
+ },
861
+ {
862
+ "epoch": 12.738095238095237,
863
+ "grad_norm": 5.1471405029296875,
864
+ "learning_rate": 1.2813051146384482e-05,
865
+ "loss": 0.2943,
866
+ "step": 1070
867
+ },
868
+ {
869
+ "epoch": 12.857142857142858,
870
+ "grad_norm": 12.514649391174316,
871
+ "learning_rate": 1.2724867724867727e-05,
872
+ "loss": 0.3033,
873
+ "step": 1080
874
+ },
875
+ {
876
+ "epoch": 12.976190476190476,
877
+ "grad_norm": 14.337882041931152,
878
+ "learning_rate": 1.263668430335097e-05,
879
+ "loss": 0.3094,
880
+ "step": 1090
881
+ },
882
+ {
883
+ "epoch": 13.0,
884
+ "eval_accuracy": 0.90375,
885
+ "eval_loss": 0.3166072368621826,
886
+ "eval_runtime": 5.6029,
887
+ "eval_samples_per_second": 142.783,
888
+ "eval_steps_per_second": 17.848,
889
+ "step": 1092
890
+ },
891
+ {
892
+ "epoch": 13.095238095238095,
893
+ "grad_norm": 7.664865493774414,
894
+ "learning_rate": 1.2548500881834216e-05,
895
+ "loss": 0.3027,
896
+ "step": 1100
897
+ },
898
+ {
899
+ "epoch": 13.214285714285714,
900
+ "grad_norm": 7.5357489585876465,
901
+ "learning_rate": 1.2460317460317461e-05,
902
+ "loss": 0.303,
903
+ "step": 1110
904
+ },
905
+ {
906
+ "epoch": 13.333333333333334,
907
+ "grad_norm": 10.726656913757324,
908
+ "learning_rate": 1.2372134038800707e-05,
909
+ "loss": 0.2897,
910
+ "step": 1120
911
+ },
912
+ {
913
+ "epoch": 13.452380952380953,
914
+ "grad_norm": 12.0068359375,
915
+ "learning_rate": 1.228395061728395e-05,
916
+ "loss": 0.2949,
917
+ "step": 1130
918
+ },
919
+ {
920
+ "epoch": 13.571428571428571,
921
+ "grad_norm": 9.883105278015137,
922
+ "learning_rate": 1.2195767195767196e-05,
923
+ "loss": 0.2573,
924
+ "step": 1140
925
+ },
926
+ {
927
+ "epoch": 13.69047619047619,
928
+ "grad_norm": 9.254776000976562,
929
+ "learning_rate": 1.2107583774250443e-05,
930
+ "loss": 0.2838,
931
+ "step": 1150
932
+ },
933
+ {
934
+ "epoch": 13.80952380952381,
935
+ "grad_norm": 6.0406880378723145,
936
+ "learning_rate": 1.2019400352733688e-05,
937
+ "loss": 0.2558,
938
+ "step": 1160
939
+ },
940
+ {
941
+ "epoch": 13.928571428571429,
942
+ "grad_norm": 6.664582252502441,
943
+ "learning_rate": 1.1931216931216932e-05,
944
+ "loss": 0.2839,
945
+ "step": 1170
946
+ },
947
+ {
948
+ "epoch": 14.0,
949
+ "eval_accuracy": 0.90125,
950
+ "eval_loss": 0.31680428981781006,
951
+ "eval_runtime": 5.4208,
952
+ "eval_samples_per_second": 147.579,
953
+ "eval_steps_per_second": 18.447,
954
+ "step": 1176
955
+ },
956
+ {
957
+ "epoch": 14.047619047619047,
958
+ "grad_norm": 11.964251518249512,
959
+ "learning_rate": 1.1843033509700177e-05,
960
+ "loss": 0.2688,
961
+ "step": 1180
962
+ },
963
+ {
964
+ "epoch": 14.166666666666666,
965
+ "grad_norm": 6.548354148864746,
966
+ "learning_rate": 1.1754850088183423e-05,
967
+ "loss": 0.2545,
968
+ "step": 1190
969
+ },
970
+ {
971
+ "epoch": 14.285714285714286,
972
+ "grad_norm": 14.916826248168945,
973
+ "learning_rate": 1.1666666666666668e-05,
974
+ "loss": 0.2618,
975
+ "step": 1200
976
+ },
977
+ {
978
+ "epoch": 14.404761904761905,
979
+ "grad_norm": 15.267316818237305,
980
+ "learning_rate": 1.1578483245149912e-05,
981
+ "loss": 0.2442,
982
+ "step": 1210
983
+ },
984
+ {
985
+ "epoch": 14.523809523809524,
986
+ "grad_norm": 8.463184356689453,
987
+ "learning_rate": 1.1490299823633157e-05,
988
+ "loss": 0.2618,
989
+ "step": 1220
990
+ },
991
+ {
992
+ "epoch": 14.642857142857142,
993
+ "grad_norm": 13.68342113494873,
994
+ "learning_rate": 1.1402116402116404e-05,
995
+ "loss": 0.2447,
996
+ "step": 1230
997
+ },
998
+ {
999
+ "epoch": 14.761904761904763,
1000
+ "grad_norm": 4.8457112312316895,
1001
+ "learning_rate": 1.131393298059965e-05,
1002
+ "loss": 0.2361,
1003
+ "step": 1240
1004
+ },
1005
+ {
1006
+ "epoch": 14.880952380952381,
1007
+ "grad_norm": 6.4722185134887695,
1008
+ "learning_rate": 1.1225749559082893e-05,
1009
+ "loss": 0.2519,
1010
+ "step": 1250
1011
+ },
1012
+ {
1013
+ "epoch": 15.0,
1014
+ "grad_norm": 15.844517707824707,
1015
+ "learning_rate": 1.1137566137566138e-05,
1016
+ "loss": 0.2658,
1017
+ "step": 1260
1018
+ },
1019
+ {
1020
+ "epoch": 15.0,
1021
+ "eval_accuracy": 0.91,
1022
+ "eval_loss": 0.2803078591823578,
1023
+ "eval_runtime": 5.3029,
1024
+ "eval_samples_per_second": 150.862,
1025
+ "eval_steps_per_second": 18.858,
1026
+ "step": 1260
1027
+ },
1028
+ {
1029
+ "epoch": 15.119047619047619,
1030
+ "grad_norm": 20.211238861083984,
1031
+ "learning_rate": 1.1049382716049384e-05,
1032
+ "loss": 0.2386,
1033
+ "step": 1270
1034
+ },
1035
+ {
1036
+ "epoch": 15.238095238095237,
1037
+ "grad_norm": 6.853188991546631,
1038
+ "learning_rate": 1.0961199294532629e-05,
1039
+ "loss": 0.2326,
1040
+ "step": 1280
1041
+ },
1042
+ {
1043
+ "epoch": 15.357142857142858,
1044
+ "grad_norm": 8.602685928344727,
1045
+ "learning_rate": 1.0873015873015873e-05,
1046
+ "loss": 0.2553,
1047
+ "step": 1290
1048
+ },
1049
+ {
1050
+ "epoch": 15.476190476190476,
1051
+ "grad_norm": 4.291443824768066,
1052
+ "learning_rate": 1.0784832451499118e-05,
1053
+ "loss": 0.2434,
1054
+ "step": 1300
1055
+ },
1056
+ {
1057
+ "epoch": 15.595238095238095,
1058
+ "grad_norm": 6.306275367736816,
1059
+ "learning_rate": 1.0696649029982365e-05,
1060
+ "loss": 0.2277,
1061
+ "step": 1310
1062
+ },
1063
+ {
1064
+ "epoch": 15.714285714285714,
1065
+ "grad_norm": 5.804137706756592,
1066
+ "learning_rate": 1.060846560846561e-05,
1067
+ "loss": 0.2248,
1068
+ "step": 1320
1069
+ },
1070
+ {
1071
+ "epoch": 15.833333333333334,
1072
+ "grad_norm": 13.754738807678223,
1073
+ "learning_rate": 1.0520282186948854e-05,
1074
+ "loss": 0.2313,
1075
+ "step": 1330
1076
+ },
1077
+ {
1078
+ "epoch": 15.952380952380953,
1079
+ "grad_norm": 8.410860061645508,
1080
+ "learning_rate": 1.04320987654321e-05,
1081
+ "loss": 0.2331,
1082
+ "step": 1340
1083
+ },
1084
+ {
1085
+ "epoch": 16.0,
1086
+ "eval_accuracy": 0.9125,
1087
+ "eval_loss": 0.26975059509277344,
1088
+ "eval_runtime": 5.4078,
1089
+ "eval_samples_per_second": 147.934,
1090
+ "eval_steps_per_second": 18.492,
1091
+ "step": 1344
1092
+ },
1093
+ {
1094
+ "epoch": 16.071428571428573,
1095
+ "grad_norm": 9.222555160522461,
1096
+ "learning_rate": 1.0352733686067021e-05,
1097
+ "loss": 0.2394,
1098
+ "step": 1350
1099
+ },
1100
+ {
1101
+ "epoch": 16.19047619047619,
1102
+ "grad_norm": 9.493171691894531,
1103
+ "learning_rate": 1.0264550264550266e-05,
1104
+ "loss": 0.2213,
1105
+ "step": 1360
1106
+ },
1107
+ {
1108
+ "epoch": 16.30952380952381,
1109
+ "grad_norm": 4.863091945648193,
1110
+ "learning_rate": 1.0176366843033512e-05,
1111
+ "loss": 0.2447,
1112
+ "step": 1370
1113
+ },
1114
+ {
1115
+ "epoch": 16.428571428571427,
1116
+ "grad_norm": 9.915203094482422,
1117
+ "learning_rate": 1.0088183421516755e-05,
1118
+ "loss": 0.2369,
1119
+ "step": 1380
1120
+ },
1121
+ {
1122
+ "epoch": 16.547619047619047,
1123
+ "grad_norm": 7.485960960388184,
1124
+ "learning_rate": 1e-05,
1125
+ "loss": 0.213,
1126
+ "step": 1390
1127
+ },
1128
+ {
1129
+ "epoch": 16.666666666666668,
1130
+ "grad_norm": 5.043095111846924,
1131
+ "learning_rate": 9.911816578483246e-06,
1132
+ "loss": 0.2035,
1133
+ "step": 1400
1134
+ },
1135
+ {
1136
+ "epoch": 16.785714285714285,
1137
+ "grad_norm": 12.636078834533691,
1138
+ "learning_rate": 9.823633156966492e-06,
1139
+ "loss": 0.2312,
1140
+ "step": 1410
1141
+ },
1142
+ {
1143
+ "epoch": 16.904761904761905,
1144
+ "grad_norm": 13.119421005249023,
1145
+ "learning_rate": 9.735449735449735e-06,
1146
+ "loss": 0.2271,
1147
+ "step": 1420
1148
+ },
1149
+ {
1150
+ "epoch": 17.0,
1151
+ "eval_accuracy": 0.90625,
1152
+ "eval_loss": 0.2662568986415863,
1153
+ "eval_runtime": 5.5739,
1154
+ "eval_samples_per_second": 143.526,
1155
+ "eval_steps_per_second": 17.941,
1156
+ "step": 1428
1157
+ },
1158
+ {
1159
+ "epoch": 17.023809523809526,
1160
+ "grad_norm": 19.145736694335938,
1161
+ "learning_rate": 9.64726631393298e-06,
1162
+ "loss": 0.2089,
1163
+ "step": 1430
1164
+ },
1165
+ {
1166
+ "epoch": 17.142857142857142,
1167
+ "grad_norm": 13.143487930297852,
1168
+ "learning_rate": 9.559082892416226e-06,
1169
+ "loss": 0.2036,
1170
+ "step": 1440
1171
+ },
1172
+ {
1173
+ "epoch": 17.261904761904763,
1174
+ "grad_norm": 13.245809555053711,
1175
+ "learning_rate": 9.470899470899471e-06,
1176
+ "loss": 0.1902,
1177
+ "step": 1450
1178
+ },
1179
+ {
1180
+ "epoch": 17.38095238095238,
1181
+ "grad_norm": 15.815811157226562,
1182
+ "learning_rate": 9.382716049382717e-06,
1183
+ "loss": 0.2234,
1184
+ "step": 1460
1185
+ },
1186
+ {
1187
+ "epoch": 17.5,
1188
+ "grad_norm": 8.950321197509766,
1189
+ "learning_rate": 9.294532627865962e-06,
1190
+ "loss": 0.2258,
1191
+ "step": 1470
1192
+ },
1193
+ {
1194
+ "epoch": 17.61904761904762,
1195
+ "grad_norm": 6.937575817108154,
1196
+ "learning_rate": 9.206349206349207e-06,
1197
+ "loss": 0.212,
1198
+ "step": 1480
1199
+ },
1200
+ {
1201
+ "epoch": 17.738095238095237,
1202
+ "grad_norm": 9.044116020202637,
1203
+ "learning_rate": 9.118165784832453e-06,
1204
+ "loss": 0.2084,
1205
+ "step": 1490
1206
+ },
1207
+ {
1208
+ "epoch": 17.857142857142858,
1209
+ "grad_norm": 9.224007606506348,
1210
+ "learning_rate": 9.029982363315696e-06,
1211
+ "loss": 0.2036,
1212
+ "step": 1500
1213
+ },
1214
+ {
1215
+ "epoch": 17.976190476190474,
1216
+ "grad_norm": 5.575143814086914,
1217
+ "learning_rate": 8.941798941798942e-06,
1218
+ "loss": 0.1895,
1219
+ "step": 1510
1220
+ },
1221
+ {
1222
+ "epoch": 18.0,
1223
+ "eval_accuracy": 0.92375,
1224
+ "eval_loss": 0.2639971375465393,
1225
+ "eval_runtime": 5.9356,
1226
+ "eval_samples_per_second": 134.779,
1227
+ "eval_steps_per_second": 16.847,
1228
+ "step": 1512
1229
+ },
1230
+ {
1231
+ "epoch": 18.095238095238095,
1232
+ "grad_norm": 10.496304512023926,
1233
+ "learning_rate": 8.853615520282187e-06,
1234
+ "loss": 0.1908,
1235
+ "step": 1520
1236
+ },
1237
+ {
1238
+ "epoch": 18.214285714285715,
1239
+ "grad_norm": 8.395342826843262,
1240
+ "learning_rate": 8.765432098765432e-06,
1241
+ "loss": 0.202,
1242
+ "step": 1530
1243
+ },
1244
+ {
1245
+ "epoch": 18.333333333333332,
1246
+ "grad_norm": 6.879480838775635,
1247
+ "learning_rate": 8.677248677248678e-06,
1248
+ "loss": 0.1825,
1249
+ "step": 1540
1250
+ },
1251
+ {
1252
+ "epoch": 18.452380952380953,
1253
+ "grad_norm": 6.519415855407715,
1254
+ "learning_rate": 8.589065255731923e-06,
1255
+ "loss": 0.2025,
1256
+ "step": 1550
1257
+ },
1258
+ {
1259
+ "epoch": 18.571428571428573,
1260
+ "grad_norm": 14.108360290527344,
1261
+ "learning_rate": 8.500881834215169e-06,
1262
+ "loss": 0.2156,
1263
+ "step": 1560
1264
+ },
1265
+ {
1266
+ "epoch": 18.69047619047619,
1267
+ "grad_norm": 8.98913288116455,
1268
+ "learning_rate": 8.412698412698414e-06,
1269
+ "loss": 0.1895,
1270
+ "step": 1570
1271
+ },
1272
+ {
1273
+ "epoch": 18.80952380952381,
1274
+ "grad_norm": 7.803181171417236,
1275
+ "learning_rate": 8.324514991181658e-06,
1276
+ "loss": 0.1829,
1277
+ "step": 1580
1278
+ },
1279
+ {
1280
+ "epoch": 18.928571428571427,
1281
+ "grad_norm": 6.913029193878174,
1282
+ "learning_rate": 8.236331569664903e-06,
1283
+ "loss": 0.1914,
1284
+ "step": 1590
1285
+ },
1286
+ {
1287
+ "epoch": 19.0,
1288
+ "eval_accuracy": 0.9225,
1289
+ "eval_loss": 0.23814032971858978,
1290
+ "eval_runtime": 6.1225,
1291
+ "eval_samples_per_second": 130.665,
1292
+ "eval_steps_per_second": 16.333,
1293
+ "step": 1596
1294
+ },
1295
+ {
1296
+ "epoch": 19.047619047619047,
1297
+ "grad_norm": 8.117836952209473,
1298
+ "learning_rate": 8.148148148148148e-06,
1299
+ "loss": 0.1953,
1300
+ "step": 1600
1301
+ },
1302
+ {
1303
+ "epoch": 19.166666666666668,
1304
+ "grad_norm": 7.4035139083862305,
1305
+ "learning_rate": 8.059964726631394e-06,
1306
+ "loss": 0.1755,
1307
+ "step": 1610
1308
+ },
1309
+ {
1310
+ "epoch": 19.285714285714285,
1311
+ "grad_norm": 9.961386680603027,
1312
+ "learning_rate": 7.971781305114639e-06,
1313
+ "loss": 0.1826,
1314
+ "step": 1620
1315
+ },
1316
+ {
1317
+ "epoch": 19.404761904761905,
1318
+ "grad_norm": 3.902977705001831,
1319
+ "learning_rate": 7.883597883597884e-06,
1320
+ "loss": 0.2038,
1321
+ "step": 1630
1322
+ },
1323
+ {
1324
+ "epoch": 19.523809523809526,
1325
+ "grad_norm": 11.7597074508667,
1326
+ "learning_rate": 7.79541446208113e-06,
1327
+ "loss": 0.1755,
1328
+ "step": 1640
1329
+ },
1330
+ {
1331
+ "epoch": 19.642857142857142,
1332
+ "grad_norm": 5.295619964599609,
1333
+ "learning_rate": 7.707231040564375e-06,
1334
+ "loss": 0.1821,
1335
+ "step": 1650
1336
+ },
1337
+ {
1338
+ "epoch": 19.761904761904763,
1339
+ "grad_norm": 8.691162109375,
1340
+ "learning_rate": 7.61904761904762e-06,
1341
+ "loss": 0.1931,
1342
+ "step": 1660
1343
+ },
1344
+ {
1345
+ "epoch": 19.88095238095238,
1346
+ "grad_norm": 14.22559928894043,
1347
+ "learning_rate": 7.530864197530865e-06,
1348
+ "loss": 0.1842,
1349
+ "step": 1670
1350
+ },
1351
+ {
1352
+ "epoch": 20.0,
1353
+ "grad_norm": 19.08734130859375,
1354
+ "learning_rate": 7.4426807760141095e-06,
1355
+ "loss": 0.1741,
1356
+ "step": 1680
1357
+ },
1358
+ {
1359
+ "epoch": 20.0,
1360
+ "eval_accuracy": 0.92125,
1361
+ "eval_loss": 0.2286679595708847,
1362
+ "eval_runtime": 6.3863,
1363
+ "eval_samples_per_second": 125.269,
1364
+ "eval_steps_per_second": 15.659,
1365
+ "step": 1680
1366
+ },
1367
+ {
1368
+ "epoch": 20.11904761904762,
1369
+ "grad_norm": 11.843003273010254,
1370
+ "learning_rate": 7.354497354497355e-06,
1371
+ "loss": 0.1704,
1372
+ "step": 1690
1373
+ },
1374
+ {
1375
+ "epoch": 20.238095238095237,
1376
+ "grad_norm": 4.4936299324035645,
1377
+ "learning_rate": 7.2663139329806e-06,
1378
+ "loss": 0.1833,
1379
+ "step": 1700
1380
+ },
1381
+ {
1382
+ "epoch": 20.357142857142858,
1383
+ "grad_norm": 3.9449779987335205,
1384
+ "learning_rate": 7.178130511463846e-06,
1385
+ "loss": 0.1779,
1386
+ "step": 1710
1387
+ },
1388
+ {
1389
+ "epoch": 20.476190476190474,
1390
+ "grad_norm": 9.936880111694336,
1391
+ "learning_rate": 7.08994708994709e-06,
1392
+ "loss": 0.1789,
1393
+ "step": 1720
1394
+ },
1395
+ {
1396
+ "epoch": 20.595238095238095,
1397
+ "grad_norm": 5.483986854553223,
1398
+ "learning_rate": 7.0017636684303355e-06,
1399
+ "loss": 0.1776,
1400
+ "step": 1730
1401
+ },
1402
+ {
1403
+ "epoch": 20.714285714285715,
1404
+ "grad_norm": 8.04780101776123,
1405
+ "learning_rate": 6.913580246913581e-06,
1406
+ "loss": 0.1737,
1407
+ "step": 1740
1408
+ },
1409
+ {
1410
+ "epoch": 20.833333333333332,
1411
+ "grad_norm": 6.709885120391846,
1412
+ "learning_rate": 6.825396825396826e-06,
1413
+ "loss": 0.1855,
1414
+ "step": 1750
1415
+ },
1416
+ {
1417
+ "epoch": 20.952380952380953,
1418
+ "grad_norm": 7.882541656494141,
1419
+ "learning_rate": 6.737213403880071e-06,
1420
+ "loss": 0.1682,
1421
+ "step": 1760
1422
+ },
1423
+ {
1424
+ "epoch": 21.0,
1425
+ "eval_accuracy": 0.92625,
1426
+ "eval_loss": 0.21940070390701294,
1427
+ "eval_runtime": 5.4374,
1428
+ "eval_samples_per_second": 147.128,
1429
+ "eval_steps_per_second": 18.391,
1430
+ "step": 1764
1431
+ },
1432
+ {
1433
+ "epoch": 21.071428571428573,
1434
+ "grad_norm": 5.953088760375977,
1435
+ "learning_rate": 6.649029982363316e-06,
1436
+ "loss": 0.1741,
1437
+ "step": 1770
1438
+ },
1439
+ {
1440
+ "epoch": 21.19047619047619,
1441
+ "grad_norm": 7.106684684753418,
1442
+ "learning_rate": 6.560846560846561e-06,
1443
+ "loss": 0.1745,
1444
+ "step": 1780
1445
+ },
1446
+ {
1447
+ "epoch": 21.30952380952381,
1448
+ "grad_norm": 13.026501655578613,
1449
+ "learning_rate": 6.472663139329807e-06,
1450
+ "loss": 0.1844,
1451
+ "step": 1790
1452
+ },
1453
+ {
1454
+ "epoch": 21.428571428571427,
1455
+ "grad_norm": 3.5783817768096924,
1456
+ "learning_rate": 6.384479717813051e-06,
1457
+ "loss": 0.1636,
1458
+ "step": 1800
1459
+ },
1460
+ {
1461
+ "epoch": 21.547619047619047,
1462
+ "grad_norm": 10.339872360229492,
1463
+ "learning_rate": 6.296296296296297e-06,
1464
+ "loss": 0.1766,
1465
+ "step": 1810
1466
+ },
1467
+ {
1468
+ "epoch": 21.666666666666668,
1469
+ "grad_norm": 13.413084030151367,
1470
+ "learning_rate": 6.208112874779542e-06,
1471
+ "loss": 0.1643,
1472
+ "step": 1820
1473
+ },
1474
+ {
1475
+ "epoch": 21.785714285714285,
1476
+ "grad_norm": 6.75968074798584,
1477
+ "learning_rate": 6.119929453262787e-06,
1478
+ "loss": 0.1589,
1479
+ "step": 1830
1480
+ },
1481
+ {
1482
+ "epoch": 21.904761904761905,
1483
+ "grad_norm": 3.4033656120300293,
1484
+ "learning_rate": 6.031746031746032e-06,
1485
+ "loss": 0.1569,
1486
+ "step": 1840
1487
+ },
1488
+ {
1489
+ "epoch": 22.0,
1490
+ "eval_accuracy": 0.93375,
1491
+ "eval_loss": 0.21772493422031403,
1492
+ "eval_runtime": 5.4159,
1493
+ "eval_samples_per_second": 147.714,
1494
+ "eval_steps_per_second": 18.464,
1495
+ "step": 1848
1496
+ },
1497
+ {
1498
+ "epoch": 22.023809523809526,
1499
+ "grad_norm": 28.122211456298828,
1500
+ "learning_rate": 5.943562610229277e-06,
1501
+ "loss": 0.1648,
1502
+ "step": 1850
1503
+ },
1504
+ {
1505
+ "epoch": 22.142857142857142,
1506
+ "grad_norm": 9.448488235473633,
1507
+ "learning_rate": 5.855379188712523e-06,
1508
+ "loss": 0.1397,
1509
+ "step": 1860
1510
+ },
1511
+ {
1512
+ "epoch": 22.261904761904763,
1513
+ "grad_norm": 7.661059379577637,
1514
+ "learning_rate": 5.767195767195768e-06,
1515
+ "loss": 0.1756,
1516
+ "step": 1870
1517
+ },
1518
+ {
1519
+ "epoch": 22.38095238095238,
1520
+ "grad_norm": 10.659194946289062,
1521
+ "learning_rate": 5.6790123456790125e-06,
1522
+ "loss": 0.1588,
1523
+ "step": 1880
1524
+ },
1525
+ {
1526
+ "epoch": 22.5,
1527
+ "grad_norm": 3.608710527420044,
1528
+ "learning_rate": 5.590828924162258e-06,
1529
+ "loss": 0.1656,
1530
+ "step": 1890
1531
+ },
1532
+ {
1533
+ "epoch": 22.61904761904762,
1534
+ "grad_norm": 10.583845138549805,
1535
+ "learning_rate": 5.502645502645503e-06,
1536
+ "loss": 0.1618,
1537
+ "step": 1900
1538
+ },
1539
+ {
1540
+ "epoch": 22.738095238095237,
1541
+ "grad_norm": 8.316349029541016,
1542
+ "learning_rate": 5.4144620811287486e-06,
1543
+ "loss": 0.1541,
1544
+ "step": 1910
1545
+ },
1546
+ {
1547
+ "epoch": 22.857142857142858,
1548
+ "grad_norm": 9.836575508117676,
1549
+ "learning_rate": 5.326278659611993e-06,
1550
+ "loss": 0.1588,
1551
+ "step": 1920
1552
+ },
1553
+ {
1554
+ "epoch": 22.976190476190474,
1555
+ "grad_norm": 7.263607501983643,
1556
+ "learning_rate": 5.2380952380952384e-06,
1557
+ "loss": 0.144,
1558
+ "step": 1930
1559
+ },
1560
+ {
1561
+ "epoch": 23.0,
1562
+ "eval_accuracy": 0.93375,
1563
+ "eval_loss": 0.21350684762001038,
1564
+ "eval_runtime": 5.4573,
1565
+ "eval_samples_per_second": 146.592,
1566
+ "eval_steps_per_second": 18.324,
1567
+ "step": 1932
1568
+ },
1569
+ {
1570
+ "epoch": 23.095238095238095,
1571
+ "grad_norm": 9.257414817810059,
1572
+ "learning_rate": 5.149911816578484e-06,
1573
+ "loss": 0.1624,
1574
+ "step": 1940
1575
+ },
1576
+ {
1577
+ "epoch": 23.214285714285715,
1578
+ "grad_norm": 23.810653686523438,
1579
+ "learning_rate": 5.061728395061729e-06,
1580
+ "loss": 0.1528,
1581
+ "step": 1950
1582
+ },
1583
+ {
1584
+ "epoch": 23.333333333333332,
1585
+ "grad_norm": 8.397602081298828,
1586
+ "learning_rate": 4.973544973544974e-06,
1587
+ "loss": 0.1506,
1588
+ "step": 1960
1589
+ },
1590
+ {
1591
+ "epoch": 23.452380952380953,
1592
+ "grad_norm": 3.9385476112365723,
1593
+ "learning_rate": 4.885361552028219e-06,
1594
+ "loss": 0.1427,
1595
+ "step": 1970
1596
+ },
1597
+ {
1598
+ "epoch": 23.571428571428573,
1599
+ "grad_norm": 9.915690422058105,
1600
+ "learning_rate": 4.7971781305114636e-06,
1601
+ "loss": 0.1548,
1602
+ "step": 1980
1603
+ },
1604
+ {
1605
+ "epoch": 23.69047619047619,
1606
+ "grad_norm": 10.970438003540039,
1607
+ "learning_rate": 4.708994708994709e-06,
1608
+ "loss": 0.165,
1609
+ "step": 1990
1610
+ },
1611
+ {
1612
+ "epoch": 23.80952380952381,
1613
+ "grad_norm": 12.917925834655762,
1614
+ "learning_rate": 4.620811287477954e-06,
1615
+ "loss": 0.1658,
1616
+ "step": 2000
1617
+ },
1618
+ {
1619
+ "epoch": 23.928571428571427,
1620
+ "grad_norm": 16.563697814941406,
1621
+ "learning_rate": 4.5326278659612e-06,
1622
+ "loss": 0.1581,
1623
+ "step": 2010
1624
+ },
1625
+ {
1626
+ "epoch": 24.0,
1627
+ "eval_accuracy": 0.9325,
1628
+ "eval_loss": 0.20941905677318573,
1629
+ "eval_runtime": 6.2026,
1630
+ "eval_samples_per_second": 128.978,
1631
+ "eval_steps_per_second": 16.122,
1632
+ "step": 2016
1633
+ },
1634
+ {
1635
+ "epoch": 24.047619047619047,
1636
+ "grad_norm": 5.663702487945557,
1637
+ "learning_rate": 4.444444444444444e-06,
1638
+ "loss": 0.1742,
1639
+ "step": 2020
1640
+ },
1641
+ {
1642
+ "epoch": 24.166666666666668,
1643
+ "grad_norm": 21.897550582885742,
1644
+ "learning_rate": 4.3562610229276895e-06,
1645
+ "loss": 0.1663,
1646
+ "step": 2030
1647
+ },
1648
+ {
1649
+ "epoch": 24.285714285714285,
1650
+ "grad_norm": 7.413302421569824,
1651
+ "learning_rate": 4.268077601410935e-06,
1652
+ "loss": 0.1478,
1653
+ "step": 2040
1654
+ },
1655
+ {
1656
+ "epoch": 24.404761904761905,
1657
+ "grad_norm": 9.5319242477417,
1658
+ "learning_rate": 4.17989417989418e-06,
1659
+ "loss": 0.1616,
1660
+ "step": 2050
1661
+ },
1662
+ {
1663
+ "epoch": 24.523809523809526,
1664
+ "grad_norm": 7.299759864807129,
1665
+ "learning_rate": 4.091710758377425e-06,
1666
+ "loss": 0.1601,
1667
+ "step": 2060
1668
+ },
1669
+ {
1670
+ "epoch": 24.642857142857142,
1671
+ "grad_norm": 7.582053184509277,
1672
+ "learning_rate": 4.00352733686067e-06,
1673
+ "loss": 0.1481,
1674
+ "step": 2070
1675
+ },
1676
+ {
1677
+ "epoch": 24.761904761904763,
1678
+ "grad_norm": 6.607138633728027,
1679
+ "learning_rate": 3.9153439153439155e-06,
1680
+ "loss": 0.1447,
1681
+ "step": 2080
1682
+ },
1683
+ {
1684
+ "epoch": 24.88095238095238,
1685
+ "grad_norm": 5.343249320983887,
1686
+ "learning_rate": 3.827160493827161e-06,
1687
+ "loss": 0.1366,
1688
+ "step": 2090
1689
+ },
1690
+ {
1691
+ "epoch": 25.0,
1692
+ "grad_norm": 11.388031005859375,
1693
+ "learning_rate": 3.7389770723104058e-06,
1694
+ "loss": 0.1426,
1695
+ "step": 2100
1696
+ },
1697
+ {
1698
+ "epoch": 25.0,
1699
+ "eval_accuracy": 0.935,
1700
+ "eval_loss": 0.2057761698961258,
1701
+ "eval_runtime": 6.4445,
1702
+ "eval_samples_per_second": 124.136,
1703
+ "eval_steps_per_second": 15.517,
1704
+ "step": 2100
1705
+ },
1706
+ {
1707
+ "epoch": 25.11904761904762,
1708
+ "grad_norm": 13.238511085510254,
1709
+ "learning_rate": 3.6507936507936507e-06,
1710
+ "loss": 0.1451,
1711
+ "step": 2110
1712
+ },
1713
+ {
1714
+ "epoch": 25.238095238095237,
1715
+ "grad_norm": 18.473859786987305,
1716
+ "learning_rate": 3.562610229276896e-06,
1717
+ "loss": 0.146,
1718
+ "step": 2120
1719
+ },
1720
+ {
1721
+ "epoch": 25.357142857142858,
1722
+ "grad_norm": 8.533565521240234,
1723
+ "learning_rate": 3.474426807760141e-06,
1724
+ "loss": 0.15,
1725
+ "step": 2130
1726
+ },
1727
+ {
1728
+ "epoch": 25.476190476190474,
1729
+ "grad_norm": 8.290474891662598,
1730
+ "learning_rate": 3.3862433862433864e-06,
1731
+ "loss": 0.1367,
1732
+ "step": 2140
1733
+ },
1734
+ {
1735
+ "epoch": 25.595238095238095,
1736
+ "grad_norm": 3.6419670581817627,
1737
+ "learning_rate": 3.2980599647266313e-06,
1738
+ "loss": 0.1392,
1739
+ "step": 2150
1740
+ },
1741
+ {
1742
+ "epoch": 25.714285714285715,
1743
+ "grad_norm": 16.031980514526367,
1744
+ "learning_rate": 3.2098765432098767e-06,
1745
+ "loss": 0.1496,
1746
+ "step": 2160
1747
+ },
1748
+ {
1749
+ "epoch": 25.833333333333332,
1750
+ "grad_norm": 16.360395431518555,
1751
+ "learning_rate": 3.1216931216931216e-06,
1752
+ "loss": 0.1705,
1753
+ "step": 2170
1754
+ },
1755
+ {
1756
+ "epoch": 25.952380952380953,
1757
+ "grad_norm": 7.6555280685424805,
1758
+ "learning_rate": 3.033509700176367e-06,
1759
+ "loss": 0.1409,
1760
+ "step": 2180
1761
+ },
1762
+ {
1763
+ "epoch": 26.0,
1764
+ "eval_accuracy": 0.93375,
1765
+ "eval_loss": 0.2027110904455185,
1766
+ "eval_runtime": 6.4093,
1767
+ "eval_samples_per_second": 124.818,
1768
+ "eval_steps_per_second": 15.602,
1769
+ "step": 2184
1770
+ },
1771
+ {
1772
+ "epoch": 26.071428571428573,
1773
+ "grad_norm": 13.298584938049316,
1774
+ "learning_rate": 2.945326278659612e-06,
1775
+ "loss": 0.1339,
1776
+ "step": 2190
1777
+ },
1778
+ {
1779
+ "epoch": 26.19047619047619,
1780
+ "grad_norm": 5.4013800621032715,
1781
+ "learning_rate": 2.8571428571428573e-06,
1782
+ "loss": 0.1388,
1783
+ "step": 2200
1784
+ },
1785
+ {
1786
+ "epoch": 26.30952380952381,
1787
+ "grad_norm": 6.846310138702393,
1788
+ "learning_rate": 2.768959435626102e-06,
1789
+ "loss": 0.1379,
1790
+ "step": 2210
1791
+ },
1792
+ {
1793
+ "epoch": 26.428571428571427,
1794
+ "grad_norm": 8.436290740966797,
1795
+ "learning_rate": 2.6807760141093476e-06,
1796
+ "loss": 0.1404,
1797
+ "step": 2220
1798
+ },
1799
+ {
1800
+ "epoch": 26.547619047619047,
1801
+ "grad_norm": 8.173290252685547,
1802
+ "learning_rate": 2.5925925925925925e-06,
1803
+ "loss": 0.1526,
1804
+ "step": 2230
1805
+ },
1806
+ {
1807
+ "epoch": 26.666666666666668,
1808
+ "grad_norm": 7.008592128753662,
1809
+ "learning_rate": 2.504409171075838e-06,
1810
+ "loss": 0.1656,
1811
+ "step": 2240
1812
+ },
1813
+ {
1814
+ "epoch": 26.785714285714285,
1815
+ "grad_norm": 14.528263092041016,
1816
+ "learning_rate": 2.416225749559083e-06,
1817
+ "loss": 0.1403,
1818
+ "step": 2250
1819
+ },
1820
+ {
1821
+ "epoch": 26.904761904761905,
1822
+ "grad_norm": 19.173906326293945,
1823
+ "learning_rate": 2.328042328042328e-06,
1824
+ "loss": 0.1445,
1825
+ "step": 2260
1826
+ },
1827
+ {
1828
+ "epoch": 27.0,
1829
+ "eval_accuracy": 0.9275,
1830
+ "eval_loss": 0.20720510184764862,
1831
+ "eval_runtime": 5.4406,
1832
+ "eval_samples_per_second": 147.044,
1833
+ "eval_steps_per_second": 18.38,
1834
+ "step": 2268
1835
+ },
1836
+ {
1837
+ "epoch": 27.023809523809526,
1838
+ "grad_norm": 7.463446140289307,
1839
+ "learning_rate": 2.239858906525573e-06,
1840
+ "loss": 0.1377,
1841
+ "step": 2270
1842
+ },
1843
+ {
1844
+ "epoch": 27.142857142857142,
1845
+ "grad_norm": 11.728429794311523,
1846
+ "learning_rate": 2.1516754850088184e-06,
1847
+ "loss": 0.1315,
1848
+ "step": 2280
1849
+ },
1850
+ {
1851
+ "epoch": 27.261904761904763,
1852
+ "grad_norm": 4.965158939361572,
1853
+ "learning_rate": 2.0634920634920634e-06,
1854
+ "loss": 0.1466,
1855
+ "step": 2290
1856
+ },
1857
+ {
1858
+ "epoch": 27.38095238095238,
1859
+ "grad_norm": 5.408288955688477,
1860
+ "learning_rate": 1.9753086419753087e-06,
1861
+ "loss": 0.1418,
1862
+ "step": 2300
1863
+ },
1864
+ {
1865
+ "epoch": 27.5,
1866
+ "grad_norm": 5.822543621063232,
1867
+ "learning_rate": 1.887125220458554e-06,
1868
+ "loss": 0.1189,
1869
+ "step": 2310
1870
+ },
1871
+ {
1872
+ "epoch": 27.61904761904762,
1873
+ "grad_norm": 16.357952117919922,
1874
+ "learning_rate": 1.798941798941799e-06,
1875
+ "loss": 0.1595,
1876
+ "step": 2320
1877
+ },
1878
+ {
1879
+ "epoch": 27.738095238095237,
1880
+ "grad_norm": 7.142815589904785,
1881
+ "learning_rate": 1.7107583774250442e-06,
1882
+ "loss": 0.1362,
1883
+ "step": 2330
1884
+ },
1885
+ {
1886
+ "epoch": 27.857142857142858,
1887
+ "grad_norm": 3.4048264026641846,
1888
+ "learning_rate": 1.6225749559082893e-06,
1889
+ "loss": 0.1226,
1890
+ "step": 2340
1891
+ },
1892
+ {
1893
+ "epoch": 27.976190476190474,
1894
+ "grad_norm": 3.4568896293640137,
1895
+ "learning_rate": 1.5343915343915345e-06,
1896
+ "loss": 0.1472,
1897
+ "step": 2350
1898
+ },
1899
+ {
1900
+ "epoch": 28.0,
1901
+ "eval_accuracy": 0.93375,
1902
+ "eval_loss": 0.2012936919927597,
1903
+ "eval_runtime": 5.3691,
1904
+ "eval_samples_per_second": 149.0,
1905
+ "eval_steps_per_second": 18.625,
1906
+ "step": 2352
1907
+ },
1908
+ {
1909
+ "epoch": 28.095238095238095,
1910
+ "grad_norm": 25.373470306396484,
1911
+ "learning_rate": 1.4462081128747796e-06,
1912
+ "loss": 0.1368,
1913
+ "step": 2360
1914
+ },
1915
+ {
1916
+ "epoch": 28.214285714285715,
1917
+ "grad_norm": 14.415323257446289,
1918
+ "learning_rate": 1.3580246913580248e-06,
1919
+ "loss": 0.145,
1920
+ "step": 2370
1921
+ },
1922
+ {
1923
+ "epoch": 28.333333333333332,
1924
+ "grad_norm": 9.555523872375488,
1925
+ "learning_rate": 1.26984126984127e-06,
1926
+ "loss": 0.1556,
1927
+ "step": 2380
1928
+ },
1929
+ {
1930
+ "epoch": 28.452380952380953,
1931
+ "grad_norm": 13.204785346984863,
1932
+ "learning_rate": 1.181657848324515e-06,
1933
+ "loss": 0.1495,
1934
+ "step": 2390
1935
+ },
1936
+ {
1937
+ "epoch": 28.571428571428573,
1938
+ "grad_norm": 13.89194393157959,
1939
+ "learning_rate": 1.0934744268077602e-06,
1940
+ "loss": 0.1221,
1941
+ "step": 2400
1942
+ },
1943
+ {
1944
+ "epoch": 28.69047619047619,
1945
+ "grad_norm": 21.069202423095703,
1946
+ "learning_rate": 1.0052910052910054e-06,
1947
+ "loss": 0.1376,
1948
+ "step": 2410
1949
+ },
1950
+ {
1951
+ "epoch": 28.80952380952381,
1952
+ "grad_norm": 13.420437812805176,
1953
+ "learning_rate": 9.171075837742504e-07,
1954
+ "loss": 0.135,
1955
+ "step": 2420
1956
+ },
1957
+ {
1958
+ "epoch": 28.928571428571427,
1959
+ "grad_norm": 6.938393592834473,
1960
+ "learning_rate": 8.289241622574956e-07,
1961
+ "loss": 0.1329,
1962
+ "step": 2430
1963
+ },
1964
+ {
1965
+ "epoch": 29.0,
1966
+ "eval_accuracy": 0.9375,
1967
+ "eval_loss": 0.19804717600345612,
1968
+ "eval_runtime": 5.2492,
1969
+ "eval_samples_per_second": 152.406,
1970
+ "eval_steps_per_second": 19.051,
1971
+ "step": 2436
1972
+ },
1973
+ {
1974
+ "epoch": 29.047619047619047,
1975
+ "grad_norm": 6.732052803039551,
1976
+ "learning_rate": 7.407407407407407e-07,
1977
+ "loss": 0.1373,
1978
+ "step": 2440
1979
+ },
1980
+ {
1981
+ "epoch": 29.166666666666668,
1982
+ "grad_norm": 11.9608154296875,
1983
+ "learning_rate": 6.525573192239859e-07,
1984
+ "loss": 0.1288,
1985
+ "step": 2450
1986
+ },
1987
+ {
1988
+ "epoch": 29.285714285714285,
1989
+ "grad_norm": 9.937846183776855,
1990
+ "learning_rate": 5.64373897707231e-07,
1991
+ "loss": 0.1267,
1992
+ "step": 2460
1993
+ },
1994
+ {
1995
+ "epoch": 29.404761904761905,
1996
+ "grad_norm": 6.138334274291992,
1997
+ "learning_rate": 4.7619047619047623e-07,
1998
+ "loss": 0.1358,
1999
+ "step": 2470
2000
+ },
2001
+ {
2002
+ "epoch": 29.523809523809526,
2003
+ "grad_norm": 5.230373859405518,
2004
+ "learning_rate": 3.880070546737214e-07,
2005
+ "loss": 0.1473,
2006
+ "step": 2480
2007
+ },
2008
+ {
2009
+ "epoch": 29.642857142857142,
2010
+ "grad_norm": 8.851503372192383,
2011
+ "learning_rate": 2.9982363315696647e-07,
2012
+ "loss": 0.1507,
2013
+ "step": 2490
2014
+ },
2015
+ {
2016
+ "epoch": 29.761904761904763,
2017
+ "grad_norm": 5.106894493103027,
2018
+ "learning_rate": 2.1164021164021165e-07,
2019
+ "loss": 0.1262,
2020
+ "step": 2500
2021
+ },
2022
+ {
2023
+ "epoch": 29.88095238095238,
2024
+ "grad_norm": 9.42648696899414,
2025
+ "learning_rate": 1.234567901234568e-07,
2026
+ "loss": 0.1397,
2027
+ "step": 2510
2028
+ },
2029
+ {
2030
+ "epoch": 30.0,
2031
+ "grad_norm": 8.953765869140625,
2032
+ "learning_rate": 3.527336860670194e-08,
2033
+ "loss": 0.132,
2034
+ "step": 2520
2035
+ },
2036
+ {
2037
+ "epoch": 30.0,
2038
+ "eval_accuracy": 0.93625,
2039
+ "eval_loss": 0.1968877911567688,
2040
+ "eval_runtime": 5.803,
2041
+ "eval_samples_per_second": 137.859,
2042
+ "eval_steps_per_second": 17.232,
2043
+ "step": 2520
2044
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 2520,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 30,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.019477021294592e+19,
+ "train_batch_size": 192,
+ "trial_name": null,
+ "trial_params": null
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d043779ba98c244cf21fe965ab08cc6f63a07c2d49949c74e46624c71feca47f
+ size 5112
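
training_args.bin is a pickled TrainingArguments object, so its contents are not readable from the diff. The values visible in trainer_state.json above (30 epochs, train_batch_size 192, logging_steps 10, one evaluation and one checkpoint per epoch, a linear warmup to roughly 2e-5 over the first ~252 steps followed by linear decay over 2520 steps, checkpoints under ./resnet50, best model selected by accuracy) are consistent with roughly the configuration below. Every field here is an inference from the log, not the actual saved arguments:

```python
from transformers import TrainingArguments

# Hedged reconstruction of plausible training settings for this run
# (transformers 4.42.x); treat all values as assumptions, not ground truth.
args = TrainingArguments(
    output_dir="./resnet50",
    num_train_epochs=30,
    per_device_train_batch_size=192,
    learning_rate=2e-5,
    warmup_ratio=0.1,                 # ~252 of 2520 steps, matching the log
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
```
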