robertou2 committed on
Commit
d107b8d
·
verified ·
1 Parent(s): 1416eb1

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -24,9 +24,9 @@
24
  "revision": null,
25
  "target_modules": [
26
  "gate_up_proj",
27
- "o_proj",
28
  "qkv_proj",
29
- "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
24
  "revision": null,
25
  "target_modules": [
26
  "gate_up_proj",
27
+ "down_proj",
28
  "qkv_proj",
29
+ "o_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67cfeac19a2419a44a38965724a3e10984b3cfb02dc513e9d9c24a26148c9165
3
  size 100697728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4de8118eb3c068f60ce36d3ca641fe0eab10557be8ebd26263b7bf370c25545e
3
  size 100697728
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 27.0,
3
- "total_flos": 1.3785203655910195e+17,
4
- "train_loss": 0.12853161850599226,
5
- "train_runtime": 2497.1205,
6
- "train_samples_per_second": 1.802,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 2.5642849233567744e+16,
4
+ "train_loss": 0.5138080072402954,
5
+ "train_runtime": 465.5561,
6
+ "train_samples_per_second": 1.718,
7
+ "train_steps_per_second": 0.107
8
  }
checkpoint-50/adapter_config.json CHANGED
@@ -23,10 +23,10 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "qkv_proj",
27
- "o_proj",
28
  "down_proj",
29
- "gate_up_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "gate_up_proj",
 
27
  "down_proj",
28
+ "qkv_proj",
29
+ "o_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
checkpoint-50/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:128026d7a03702aaee46fa45f35639b5ff805f54560f6fa6a67213b3a3deabbf
3
  size 100697728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4de8118eb3c068f60ce36d3ca641fe0eab10557be8ebd26263b7bf370c25545e
3
  size 100697728
checkpoint-50/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c491a3247a7ee5eea21a2e2b697593d159b3eb60006237830144a20053c047d5
3
  size 201541754
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f767fbe59969feec5bcec50d5fab9da54c2d7ca23cdccf7d44603ec638346f2
3
  size 201541754
checkpoint-50/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d72bf840684c4ffeae56d99ac7069162bd15b8d308b33e5cdefaf8d7910aea0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20acd0328870e82721a4e81751e637c1df5076c8b6301d6bc1828612b34f862b
3
  size 14244
checkpoint-50/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_global_step": 48,
3
- "best_metric": 0.6318144798278809,
4
- "best_model_checkpoint": "//outputs/task7_microsoft/Phi-3.5-mini-instruct/checkpoint-48",
5
- "epoch": 4.173913043478261,
6
  "eval_steps": 500,
7
  "global_step": 50,
8
  "is_hyper_param_search": false,
@@ -10,120 +10,80 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.43478260869565216,
14
- "grad_norm": 0.3567487597465515,
15
- "learning_rate": 4e-05,
16
- "loss": 0.7927,
17
  "step": 5
18
  },
19
- {
20
- "epoch": 0.8695652173913043,
21
- "grad_norm": 0.20804046094417572,
22
- "learning_rate": 8e-05,
23
- "loss": 0.877,
24
- "step": 10
25
- },
26
  {
27
  "epoch": 1.0,
28
- "eval_loss": 0.7741447687149048,
29
- "eval_runtime": 3.8872,
30
- "eval_samples_per_second": 5.145,
31
- "eval_steps_per_second": 0.772,
32
- "step": 12
33
  },
34
  {
35
- "epoch": 1.2608695652173914,
36
- "grad_norm": 0.14667491614818573,
37
- "learning_rate": 0.00012,
38
- "loss": 0.797,
39
  "step": 15
40
  },
41
- {
42
- "epoch": 1.6956521739130435,
43
- "grad_norm": 0.18651264905929565,
44
- "learning_rate": 0.00016,
45
- "loss": 0.6654,
46
- "step": 20
47
- },
48
  {
49
  "epoch": 2.0,
50
- "eval_loss": 0.6869648694992065,
51
- "eval_runtime": 3.8967,
52
- "eval_samples_per_second": 5.132,
53
- "eval_steps_per_second": 0.77,
54
- "step": 24
55
  },
56
  {
57
- "epoch": 2.0869565217391304,
58
- "grad_norm": 0.18070296943187714,
59
- "learning_rate": 0.0002,
60
- "loss": 0.6351,
61
  "step": 25
62
  },
63
  {
64
- "epoch": 2.5217391304347827,
65
- "grad_norm": 0.23248770833015442,
66
- "learning_rate": 0.00018090169943749476,
67
- "loss": 0.6273,
68
  "step": 30
69
  },
70
  {
71
- "epoch": 2.9565217391304346,
72
- "grad_norm": 0.19235172867774963,
73
- "learning_rate": 0.00013090169943749476,
74
- "loss": 0.599,
75
  "step": 35
76
  },
77
  {
78
- "epoch": 3.0,
79
- "eval_loss": 0.6456981301307678,
80
- "eval_runtime": 3.8903,
81
- "eval_samples_per_second": 5.141,
82
- "eval_steps_per_second": 0.771,
83
- "step": 36
84
- },
85
- {
86
- "epoch": 3.3478260869565215,
87
- "grad_norm": 0.18532131612300873,
88
- "learning_rate": 6.909830056250527e-05,
89
- "loss": 0.5887,
90
  "step": 40
91
  },
92
  {
93
- "epoch": 3.782608695652174,
94
- "grad_norm": 0.23707902431488037,
95
- "learning_rate": 1.9098300562505266e-05,
96
- "loss": 0.5482,
97
  "step": 45
98
  },
99
  {
100
- "epoch": 4.0,
101
- "eval_loss": 0.6318144798278809,
102
- "eval_runtime": 3.8893,
103
- "eval_samples_per_second": 5.142,
104
- "eval_steps_per_second": 0.771,
105
- "step": 48
106
- },
107
- {
108
- "epoch": 4.173913043478261,
109
- "grad_norm": 0.20858274400234222,
110
  "learning_rate": 0.0,
111
- "loss": 0.5586,
112
- "step": 50
113
- },
114
- {
115
- "epoch": 4.173913043478261,
116
- "eval_loss": 0.6320340037345886,
117
- "eval_runtime": 3.8925,
118
- "eval_samples_per_second": 5.138,
119
- "eval_steps_per_second": 0.771,
120
  "step": 50
121
  }
122
  ],
123
  "logging_steps": 5,
124
  "max_steps": 50,
125
  "num_input_tokens_seen": 0,
126
- "num_train_epochs": 5,
127
  "save_steps": 500,
128
  "stateful_callbacks": {
129
  "TrainerControl": {
@@ -137,7 +97,7 @@
137
  "attributes": {}
138
  }
139
  },
140
- "total_flos": 2.8914967713060864e+16,
141
  "train_batch_size": 2,
142
  "trial_name": null,
143
  "trial_params": null
 
1
  {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
  "eval_steps": 500,
7
  "global_step": 50,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.5333333333333333,
14
+ "grad_norm": 0.21942713856697083,
15
+ "learning_rate": 0.000199107748815478,
16
+ "loss": 0.7714,
17
  "step": 5
18
  },
 
 
 
 
 
 
 
19
  {
20
  "epoch": 1.0,
21
+ "grad_norm": 0.27160170674324036,
22
+ "learning_rate": 0.00018925188358598813,
23
+ "loss": 0.6282,
24
+ "step": 10
 
25
  },
26
  {
27
+ "epoch": 1.5333333333333332,
28
+ "grad_norm": 0.16776053607463837,
29
+ "learning_rate": 0.00016951924276746425,
30
+ "loss": 0.5601,
31
  "step": 15
32
  },
 
 
 
 
 
 
 
33
  {
34
  "epoch": 2.0,
35
+ "grad_norm": 0.2320907711982727,
36
+ "learning_rate": 0.0001420934762428335,
37
+ "loss": 0.5471,
38
+ "step": 20
 
39
  },
40
  {
41
+ "epoch": 2.533333333333333,
42
+ "grad_norm": 0.15466105937957764,
43
+ "learning_rate": 0.00011000956916240985,
44
+ "loss": 0.4923,
45
  "step": 25
46
  },
47
  {
48
+ "epoch": 3.0,
49
+ "grad_norm": 0.31745201349258423,
50
+ "learning_rate": 7.681798497324716e-05,
51
+ "loss": 0.4651,
52
  "step": 30
53
  },
54
  {
55
+ "epoch": 3.533333333333333,
56
+ "grad_norm": 0.19490917026996613,
57
+ "learning_rate": 4.6191764683662744e-05,
58
+ "loss": 0.4367,
59
  "step": 35
60
  },
61
  {
62
+ "epoch": 4.0,
63
+ "grad_norm": 0.3345278799533844,
64
+ "learning_rate": 2.1520061472133902e-05,
65
+ "loss": 0.4268,
 
 
 
 
 
 
 
 
66
  "step": 40
67
  },
68
  {
69
+ "epoch": 4.533333333333333,
70
+ "grad_norm": 0.156062051653862,
71
+ "learning_rate": 5.533090839208133e-06,
72
+ "loss": 0.402,
73
  "step": 45
74
  },
75
  {
76
+ "epoch": 5.0,
77
+ "grad_norm": 0.24310481548309326,
 
 
 
 
 
 
 
 
78
  "learning_rate": 0.0,
79
+ "loss": 0.4084,
 
 
 
 
 
 
 
 
80
  "step": 50
81
  }
82
  ],
83
  "logging_steps": 5,
84
  "max_steps": 50,
85
  "num_input_tokens_seen": 0,
86
+ "num_train_epochs": 6,
87
  "save_steps": 500,
88
  "stateful_callbacks": {
89
  "TrainerControl": {
 
97
  "attributes": {}
98
  }
99
  },
100
+ "total_flos": 2.5642849233567744e+16,
101
  "train_batch_size": 2,
102
  "trial_name": null,
103
  "trial_params": null
checkpoint-50/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b379700727532ab22acd204ba858eb7dbdb3c8a9496a4558aed006ad9fda0ad3
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfc8b2f6195776ee0127d015bd85f458c29181888c317a3105ff4995afd4007f
3
  size 5624
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 27.0,
3
- "total_flos": 1.3785203655910195e+17,
4
- "train_loss": 0.12853161850599226,
5
- "train_runtime": 2497.1205,
6
- "train_samples_per_second": 1.802,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 2.5642849233567744e+16,
4
+ "train_loss": 0.5138080072402954,
5
+ "train_runtime": 465.5561,
6
+ "train_samples_per_second": 1.718,
7
+ "train_steps_per_second": 0.107
8
  }
trainer_state.json CHANGED
@@ -2,405 +2,97 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 27.0,
6
  "eval_steps": 500,
7
- "global_step": 270,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.5333333333333333,
14
- "grad_norm": 0.3464358448982239,
15
- "learning_rate": 7.142857142857143e-05,
16
- "loss": 0.7933,
17
  "step": 5
18
  },
19
  {
20
  "epoch": 1.0,
21
- "grad_norm": 0.2775850296020508,
22
- "learning_rate": 0.00014285714285714287,
23
- "loss": 0.6763,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 1.5333333333333332,
28
- "grad_norm": 0.20196351408958435,
29
- "learning_rate": 0.00019999247018391447,
30
- "loss": 0.612,
31
  "step": 15
32
  },
33
  {
34
  "epoch": 2.0,
35
- "grad_norm": 0.20944692194461823,
36
- "learning_rate": 0.00019972904566786903,
37
- "loss": 0.593,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 2.533333333333333,
42
- "grad_norm": 0.1796199381351471,
43
- "learning_rate": 0.000199090263542778,
44
- "loss": 0.528,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 3.0,
49
- "grad_norm": 0.3253108859062195,
50
- "learning_rate": 0.00019807852804032305,
51
- "loss": 0.4901,
52
  "step": 30
53
  },
54
  {
55
  "epoch": 3.533333333333333,
56
- "grad_norm": 0.2155134528875351,
57
- "learning_rate": 0.00019669764710448522,
58
- "loss": 0.4385,
59
  "step": 35
60
  },
61
  {
62
  "epoch": 4.0,
63
- "grad_norm": 0.40249642729759216,
64
- "learning_rate": 0.00019495281805930367,
65
- "loss": 0.4146,
66
  "step": 40
67
  },
68
  {
69
  "epoch": 4.533333333333333,
70
- "grad_norm": 0.21931566298007965,
71
- "learning_rate": 0.00019285060804732158,
72
- "loss": 0.3422,
73
  "step": 45
74
  },
75
  {
76
  "epoch": 5.0,
77
- "grad_norm": 0.47045040130615234,
78
- "learning_rate": 0.00019039892931234435,
79
- "loss": 0.334,
80
- "step": 50
81
- },
82
- {
83
- "epoch": 5.533333333333333,
84
- "grad_norm": 0.33896583318710327,
85
- "learning_rate": 0.00018760700941954065,
86
- "loss": 0.3055,
87
- "step": 55
88
- },
89
- {
90
- "epoch": 6.0,
91
- "grad_norm": 0.6106130480766296,
92
- "learning_rate": 0.00018448535652497073,
93
- "loss": 0.2364,
94
- "step": 60
95
- },
96
- {
97
- "epoch": 6.533333333333333,
98
- "grad_norm": 0.34181615710258484,
99
- "learning_rate": 0.0001810457198252595,
100
- "loss": 0.19,
101
- "step": 65
102
- },
103
- {
104
- "epoch": 7.0,
105
- "grad_norm": 0.8446937799453735,
106
- "learning_rate": 0.0001773010453362737,
107
- "loss": 0.1965,
108
- "step": 70
109
- },
110
- {
111
- "epoch": 7.533333333333333,
112
- "grad_norm": 0.5083507299423218,
113
- "learning_rate": 0.00017326542716724128,
114
- "loss": 0.1207,
115
- "step": 75
116
- },
117
- {
118
- "epoch": 8.0,
119
- "grad_norm": 0.7833607196807861,
120
- "learning_rate": 0.0001689540544737067,
121
- "loss": 0.1283,
122
- "step": 80
123
- },
124
- {
125
- "epoch": 8.533333333333333,
126
- "grad_norm": 0.3553655743598938,
127
- "learning_rate": 0.00016438315428897915,
128
- "loss": 0.0811,
129
- "step": 85
130
- },
131
- {
132
- "epoch": 9.0,
133
- "grad_norm": 0.7091866135597229,
134
- "learning_rate": 0.00015956993044924334,
135
- "loss": 0.0802,
136
- "step": 90
137
- },
138
- {
139
- "epoch": 9.533333333333333,
140
- "grad_norm": 0.43794405460357666,
141
- "learning_rate": 0.00015453249884220464,
142
- "loss": 0.0492,
143
- "step": 95
144
- },
145
- {
146
- "epoch": 10.0,
147
- "grad_norm": 0.5798842310905457,
148
- "learning_rate": 0.00014928981922297842,
149
- "loss": 0.0524,
150
- "step": 100
151
- },
152
- {
153
- "epoch": 10.533333333333333,
154
- "grad_norm": 0.3912927210330963,
155
- "learning_rate": 0.00014386162385385278,
156
- "loss": 0.0302,
157
- "step": 105
158
- },
159
- {
160
- "epoch": 11.0,
161
- "grad_norm": 0.4159412682056427,
162
- "learning_rate": 0.000138268343236509,
163
- "loss": 0.0328,
164
- "step": 110
165
- },
166
- {
167
- "epoch": 11.533333333333333,
168
- "grad_norm": 0.24435237050056458,
169
- "learning_rate": 0.0001325310292162263,
170
- "loss": 0.0239,
171
- "step": 115
172
- },
173
- {
174
- "epoch": 12.0,
175
- "grad_norm": 0.24223262071609497,
176
- "learning_rate": 0.00012667127574748986,
177
- "loss": 0.0171,
178
- "step": 120
179
- },
180
- {
181
- "epoch": 12.533333333333333,
182
- "grad_norm": 0.1762484759092331,
183
- "learning_rate": 0.00012071113761922186,
184
- "loss": 0.0135,
185
- "step": 125
186
- },
187
- {
188
- "epoch": 13.0,
189
- "grad_norm": 0.28046339750289917,
190
- "learning_rate": 0.00011467304744553618,
191
- "loss": 0.0143,
192
- "step": 130
193
- },
194
- {
195
- "epoch": 13.533333333333333,
196
- "grad_norm": 0.28064408898353577,
197
- "learning_rate": 0.000108579731234444,
198
- "loss": 0.0128,
199
- "step": 135
200
- },
201
- {
202
- "epoch": 14.0,
203
- "grad_norm": 0.19103752076625824,
204
- "learning_rate": 0.00010245412285229124,
205
- "loss": 0.0101,
206
- "step": 140
207
- },
208
- {
209
- "epoch": 14.533333333333333,
210
- "grad_norm": 0.2761705815792084,
211
- "learning_rate": 9.631927770586412e-05,
212
- "loss": 0.0096,
213
- "step": 145
214
- },
215
- {
216
- "epoch": 15.0,
217
- "grad_norm": 0.3451383709907532,
218
- "learning_rate": 9.019828596704394e-05,
219
- "loss": 0.0084,
220
- "step": 150
221
- },
222
- {
223
- "epoch": 15.533333333333333,
224
- "grad_norm": 0.22912859916687012,
225
- "learning_rate": 8.411418566661388e-05,
226
- "loss": 0.0087,
227
- "step": 155
228
- },
229
- {
230
- "epoch": 16.0,
231
- "grad_norm": 0.2961307466030121,
232
- "learning_rate": 7.808987598431303e-05,
233
- "loss": 0.0054,
234
- "step": 160
235
- },
236
- {
237
- "epoch": 16.533333333333335,
238
- "grad_norm": 0.05174371972680092,
239
- "learning_rate": 7.21480310614947e-05,
240
- "loss": 0.0037,
241
- "step": 165
242
- },
243
- {
244
- "epoch": 17.0,
245
- "grad_norm": 0.6668692827224731,
246
- "learning_rate": 6.6311014660778e-05,
247
- "loss": 0.0086,
248
- "step": 170
249
- },
250
- {
251
- "epoch": 17.533333333333335,
252
- "grad_norm": 0.0423920676112175,
253
- "learning_rate": 6.060079599389521e-05,
254
- "loss": 0.0046,
255
- "step": 175
256
- },
257
- {
258
- "epoch": 18.0,
259
- "grad_norm": 0.048323437571525574,
260
- "learning_rate": 5.503886703453933e-05,
261
- "loss": 0.005,
262
- "step": 180
263
- },
264
- {
265
- "epoch": 18.533333333333335,
266
- "grad_norm": 0.14866454899311066,
267
- "learning_rate": 4.964616162742826e-05,
268
- "loss": 0.0039,
269
- "step": 185
270
- },
271
- {
272
- "epoch": 19.0,
273
- "grad_norm": 0.04198712110519409,
274
- "learning_rate": 4.444297669803981e-05,
275
- "loss": 0.0033,
276
- "step": 190
277
- },
278
- {
279
- "epoch": 19.533333333333335,
280
- "grad_norm": 0.037916265428066254,
281
- "learning_rate": 3.944889585956746e-05,
282
- "loss": 0.0075,
283
- "step": 195
284
- },
285
- {
286
- "epoch": 20.0,
287
- "grad_norm": 0.031426794826984406,
288
- "learning_rate": 3.468271570462235e-05,
289
- "loss": 0.0036,
290
- "step": 200
291
- },
292
- {
293
- "epoch": 20.533333333333335,
294
- "grad_norm": 0.1819785088300705,
295
- "learning_rate": 3.016237505910272e-05,
296
- "loss": 0.0062,
297
- "step": 205
298
- },
299
- {
300
- "epoch": 21.0,
301
- "grad_norm": 0.09462948143482208,
302
- "learning_rate": 2.5904887464504114e-05,
303
- "loss": 0.0034,
304
- "step": 210
305
- },
306
- {
307
- "epoch": 21.533333333333335,
308
- "grad_norm": 0.11672957241535187,
309
- "learning_rate": 2.1926277142790552e-05,
310
- "loss": 0.0025,
311
- "step": 215
312
- },
313
- {
314
- "epoch": 22.0,
315
- "grad_norm": 0.04571106657385826,
316
- "learning_rate": 1.824151868484164e-05,
317
- "loss": 0.0042,
318
- "step": 220
319
- },
320
- {
321
- "epoch": 22.533333333333335,
322
- "grad_norm": 0.019737839698791504,
323
- "learning_rate": 1.486448068947348e-05,
324
- "loss": 0.0021,
325
- "step": 225
326
- },
327
- {
328
- "epoch": 23.0,
329
- "grad_norm": 0.03002820909023285,
330
- "learning_rate": 1.1807873565164506e-05,
331
- "loss": 0.0049,
332
- "step": 230
333
- },
334
- {
335
- "epoch": 23.533333333333335,
336
- "grad_norm": 0.24355602264404297,
337
- "learning_rate": 9.083201690947763e-06,
338
- "loss": 0.004,
339
- "step": 235
340
- },
341
- {
342
- "epoch": 24.0,
343
- "grad_norm": 0.035868410021066666,
344
- "learning_rate": 6.700720116526116e-06,
345
- "loss": 0.0024,
346
- "step": 240
347
- },
348
- {
349
- "epoch": 24.533333333333335,
350
- "grad_norm": 0.029756512492895126,
351
- "learning_rate": 4.669395964580614e-06,
352
- "loss": 0.0025,
353
- "step": 245
354
- },
355
- {
356
- "epoch": 25.0,
357
- "grad_norm": 0.2836012542247772,
358
- "learning_rate": 2.996874680545603e-06,
359
- "loss": 0.0059,
360
- "step": 250
361
- },
362
- {
363
- "epoch": 25.533333333333335,
364
- "grad_norm": 0.2209901213645935,
365
- "learning_rate": 1.6894512568783716e-06,
366
- "loss": 0.004,
367
- "step": 255
368
- },
369
- {
370
- "epoch": 26.0,
371
- "grad_norm": 0.023770242929458618,
372
- "learning_rate": 7.520465401290033e-07,
373
- "loss": 0.0033,
374
- "step": 260
375
- },
376
- {
377
- "epoch": 26.533333333333335,
378
- "grad_norm": 0.02641889452934265,
379
- "learning_rate": 1.8818870998508208e-07,
380
- "loss": 0.0032,
381
- "step": 265
382
- },
383
- {
384
- "epoch": 27.0,
385
- "grad_norm": 0.03176680952310562,
386
  "learning_rate": 0.0,
387
- "loss": 0.0025,
388
- "step": 270
389
  },
390
  {
391
- "epoch": 27.0,
392
- "step": 270,
393
- "total_flos": 1.3785203655910195e+17,
394
- "train_loss": 0.12853161850599226,
395
- "train_runtime": 2497.1205,
396
- "train_samples_per_second": 1.802,
397
- "train_steps_per_second": 0.108
398
  }
399
  ],
400
  "logging_steps": 5,
401
- "max_steps": 270,
402
  "num_input_tokens_seen": 0,
403
- "num_train_epochs": 30,
404
  "save_steps": 500,
405
  "stateful_callbacks": {
406
  "TrainerControl": {
@@ -414,7 +106,7 @@
414
  "attributes": {}
415
  }
416
  },
417
- "total_flos": 1.3785203655910195e+17,
418
  "train_batch_size": 2,
419
  "trial_name": null,
420
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
  "eval_steps": 500,
7
+ "global_step": 50,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.5333333333333333,
14
+ "grad_norm": 0.21942713856697083,
15
+ "learning_rate": 0.000199107748815478,
16
+ "loss": 0.7714,
17
  "step": 5
18
  },
19
  {
20
  "epoch": 1.0,
21
+ "grad_norm": 0.27160170674324036,
22
+ "learning_rate": 0.00018925188358598813,
23
+ "loss": 0.6282,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 1.5333333333333332,
28
+ "grad_norm": 0.16776053607463837,
29
+ "learning_rate": 0.00016951924276746425,
30
+ "loss": 0.5601,
31
  "step": 15
32
  },
33
  {
34
  "epoch": 2.0,
35
+ "grad_norm": 0.2320907711982727,
36
+ "learning_rate": 0.0001420934762428335,
37
+ "loss": 0.5471,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 2.533333333333333,
42
+ "grad_norm": 0.15466105937957764,
43
+ "learning_rate": 0.00011000956916240985,
44
+ "loss": 0.4923,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 3.0,
49
+ "grad_norm": 0.31745201349258423,
50
+ "learning_rate": 7.681798497324716e-05,
51
+ "loss": 0.4651,
52
  "step": 30
53
  },
54
  {
55
  "epoch": 3.533333333333333,
56
+ "grad_norm": 0.19490917026996613,
57
+ "learning_rate": 4.6191764683662744e-05,
58
+ "loss": 0.4367,
59
  "step": 35
60
  },
61
  {
62
  "epoch": 4.0,
63
+ "grad_norm": 0.3345278799533844,
64
+ "learning_rate": 2.1520061472133902e-05,
65
+ "loss": 0.4268,
66
  "step": 40
67
  },
68
  {
69
  "epoch": 4.533333333333333,
70
+ "grad_norm": 0.156062051653862,
71
+ "learning_rate": 5.533090839208133e-06,
72
+ "loss": 0.402,
73
  "step": 45
74
  },
75
  {
76
  "epoch": 5.0,
77
+ "grad_norm": 0.24310481548309326,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  "learning_rate": 0.0,
79
+ "loss": 0.4084,
80
+ "step": 50
81
  },
82
  {
83
+ "epoch": 5.0,
84
+ "step": 50,
85
+ "total_flos": 2.5642849233567744e+16,
86
+ "train_loss": 0.5138080072402954,
87
+ "train_runtime": 465.5561,
88
+ "train_samples_per_second": 1.718,
89
+ "train_steps_per_second": 0.107
90
  }
91
  ],
92
  "logging_steps": 5,
93
+ "max_steps": 50,
94
  "num_input_tokens_seen": 0,
95
+ "num_train_epochs": 6,
96
  "save_steps": 500,
97
  "stateful_callbacks": {
98
  "TrainerControl": {
 
106
  "attributes": {}
107
  }
108
  },
109
+ "total_flos": 2.5642849233567744e+16,
110
  "train_batch_size": 2,
111
  "trial_name": null,
112
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a98014c58d55a004fcf6f633921d0efe54b1543b60e7ac340ed1a1b1f40acb7
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfc8b2f6195776ee0127d015bd85f458c29181888c317a3105ff4995afd4007f
3
  size 5624