crossroderick committed on
Commit
3cf1937
·
1 Parent(s): 712ca44

Refined (v5.3) update

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +6 -4
  2. checkpoints/checkpoint-57000/trainer_state.json +0 -840
  3. checkpoints/checkpoint-57500/trainer_state.json +0 -847
  4. checkpoints/checkpoint-57726/trainer_state.json +0 -847
  5. checkpoints/{checkpoint-57000 → checkpoint-61500}/config.json +0 -0
  6. checkpoints/{checkpoint-57000 → checkpoint-61500}/generation_config.json +0 -0
  7. checkpoints/{checkpoint-57000 → checkpoint-61500}/model.safetensors +1 -1
  8. checkpoints/{checkpoint-57500 → checkpoint-61500}/optimizer.pt +1 -1
  9. checkpoints/{checkpoint-57726 → checkpoint-61500}/rng_state.pth +1 -1
  10. checkpoints/{checkpoint-57726 → checkpoint-61500}/scaler.pt +1 -1
  11. checkpoints/{checkpoint-57000 → checkpoint-61500}/scheduler.pt +1 -1
  12. checkpoints/{checkpoint-57000 → checkpoint-61500}/special_tokens_map.json +0 -0
  13. checkpoints/{checkpoint-57500 → checkpoint-61500}/spiece.model +2 -2
  14. checkpoints/{checkpoint-57000 → checkpoint-61500}/tokenizer.json +0 -0
  15. checkpoints/{checkpoint-57000 → checkpoint-61500}/tokenizer_config.json +0 -0
  16. checkpoints/checkpoint-61500/trainer_state.json +903 -0
  17. checkpoints/{checkpoint-57000 → checkpoint-61500}/training_args.bin +0 -0
  18. checkpoints/{checkpoint-57500 → checkpoint-62000}/config.json +0 -0
  19. checkpoints/{checkpoint-57500 → checkpoint-62000}/generation_config.json +0 -0
  20. checkpoints/{checkpoint-57500 → checkpoint-62000}/model.safetensors +1 -1
  21. checkpoints/{checkpoint-57000 → checkpoint-62000}/optimizer.pt +1 -1
  22. checkpoints/{checkpoint-57500 → checkpoint-62000}/rng_state.pth +1 -1
  23. checkpoints/{checkpoint-57500 → checkpoint-62000}/scaler.pt +1 -1
  24. checkpoints/{checkpoint-57726 → checkpoint-62000}/scheduler.pt +1 -1
  25. checkpoints/{checkpoint-57500 → checkpoint-62000}/special_tokens_map.json +0 -0
  26. checkpoints/{checkpoint-57000 → checkpoint-62000}/spiece.model +2 -2
  27. checkpoints/{checkpoint-57500 → checkpoint-62000}/tokenizer.json +0 -0
  28. checkpoints/{checkpoint-57500 → checkpoint-62000}/tokenizer_config.json +0 -0
  29. checkpoints/checkpoint-62000/trainer_state.json +910 -0
  30. checkpoints/{checkpoint-57500 → checkpoint-62000}/training_args.bin +0 -0
  31. checkpoints/{checkpoint-57726 → checkpoint-62164}/config.json +0 -0
  32. checkpoints/{checkpoint-57726 → checkpoint-62164}/generation_config.json +0 -0
  33. checkpoints/{checkpoint-57726 → checkpoint-62164}/model.safetensors +1 -1
  34. checkpoints/{checkpoint-57726 → checkpoint-62164}/optimizer.pt +1 -1
  35. checkpoints/{checkpoint-57000 → checkpoint-62164}/rng_state.pth +1 -1
  36. checkpoints/{checkpoint-57000 → checkpoint-62164}/scaler.pt +1 -1
  37. checkpoints/{checkpoint-57500 → checkpoint-62164}/scheduler.pt +1 -1
  38. checkpoints/{checkpoint-57726 → checkpoint-62164}/special_tokens_map.json +0 -0
  39. checkpoints/{checkpoint-57726 → checkpoint-62164}/spiece.model +2 -2
  40. checkpoints/{checkpoint-57726 → checkpoint-62164}/tokenizer.json +0 -0
  41. checkpoints/{checkpoint-57726 → checkpoint-62164}/tokenizer_config.json +0 -0
  42. checkpoints/checkpoint-62164/trainer_state.json +910 -0
  43. checkpoints/{checkpoint-57726 → checkpoint-62164}/training_args.bin +0 -0
  44. model.safetensors +1 -1
  45. spiece.model +2 -2
  46. src/data/generate_cyr_lat_pairs.py +1 -1
  47. src/tokeniser/dalat5_sp.model +2 -2
  48. src/tokeniser/dalat5_sp.vocab +0 -0
  49. src/tokeniser/spiece.model +2 -2
  50. src/train_tokeniser.py +1 -0
README.md CHANGED
@@ -24,10 +24,10 @@ model-index:
24
  metrics:
25
  - name: Training Loss
26
  type: loss
27
- value: 0.7541
28
  - name: Evaluation Loss
29
  type: loss
30
- value: 0.1166
31
  ---
32
  # DalaT5 - T5 Fine-Tuned on Cyrillic-to-Latin Kazakh 🇰🇿
33
 
@@ -139,9 +139,11 @@ KazParC деректер жинағын жүктеп алу үшін сізге
139
 
140
  * **DalaT5 v5**: 25 сәуірде дәл реттелген және сол күні қолжетімді болды. Қазақ кириллица және латын графикасын жақсырақ өңдеу үшін өзінің жеке токенизаторы бар ~1,9 миллион жазба (v4 сияқты) пайдаланылды / Fine-tuned on April 25 and made available on the same day. Used ~1.9 million records (like v4) with its own tokeniser to better handle the Kazakh Cyrillic and Latin scripts
141
 
142
- * **DalaT5 v5.1**: 25 сәуірде (v5 нұсқасынан кейін бірден) дәл реттелген және сол күні қолжетімді болды. Жақсырақ жалпылауды қамтамасыз ету үшін оқу үшін ~2,2 миллион жазба және токенизатор үшін 1 миллион жазба пайдаланылды / Fine-tuned on April 25 (immediately after v5) and made available on the same day. Used ~2.2 million records for training and 1 million records for the tokeniser to ensure better generalisation
143
 
144
- * **DalaT5 v5.2**: 26 сәуірде дәл реттелген және сол күні қолжетімді болды. Жалпы бірдей токенизатор құрылымын пайдаланды, бірақ оқыту үшін ~2,4 миллион жазбаны пайдаланды. Валидацияның жоғалуы да алғаш рет осы нұсқада қолжетімді болды / Fine-tuned on April 26 and made available on the same day. Used the same tokeniser structure overall, but leveraged ~2.4 million records for training. The evaluation loss was also made available with this version
 
 
145
 
146
  ---
147
 
 
24
  metrics:
25
  - name: Training Loss
26
  type: loss
27
+ value: 0.6684
28
  - name: Evaluation Loss
29
  type: loss
30
+ value: 0.0886
31
  ---
32
  # DalaT5 - T5 Fine-Tuned on Cyrillic-to-Latin Kazakh 🇰🇿
33
 
 
139
 
140
  * **DalaT5 v5**: 25 сәуірде дәл реттелген және сол күні қолжетімді болды. Қазақ кириллица және латын графикасын жақсырақ өңдеу үшін өзінің жеке токенизаторы бар ~1,9 миллион жазба (v4 сияқты) пайдаланылды / Fine-tuned on April 25 and made available on the same day. Used ~1.9 million records (like v4) with its own tokeniser to better handle the Kazakh Cyrillic and Latin scripts
141
 
142
+ * **DalaT5 v5.1**: 25 сәуірде (v5 нұсқасынан кейін бірден) дәл реттелген және сол күні қолжетімді болды. Жақсырақ жалпылауды қамтамасыз ету үшін жаттығу үшін ~2,2 миллион жазба және токенизатор үшін 1 миллион жазба пайдаланылды. v5-пен салыстырғанда галлюцинациялар күрт төмендеп, семантикалық түсіну одан әрі жақсарды / Fine-tuned on April 25 (immediately after v5) and made available on the same day. Used ~2.2 million records for training and 1 million records for the tokeniser to ensure better generalisation. Hallucinations decreased drastically when compared to v5, and semantic understanding was further enhanced
143
 
144
+ * **DalaT5 v5.2**: 26 сәуірде дәл реттелген және сол күні қолжетімді болды. Жалпы бірдей токенизатор құрылымын пайдаланды, бірақ оқыту және бағалау үшін ~2,4 миллион жазбаны пайдаланды. Валидацияның жоғалуы да алғаш рет осы нұсқада қолжетімді болды. Жалпы, v5.1 нұсқасынан өтіп кеткен галлюцинациялар толығымен дерлік жойылды / Fine-tuned on April 26 and made available on the same day. Used the same tokeniser structure overall, but leveraged ~2.4 million records for training and evaluation. The evaluation loss was also made available with this version. Overall, hallucinations that made it through v5.1 were almost completely eliminated
145
+
146
+ * **DalaT5 v5.3**: 28 сәуірде дәл реттелген және сол күні қолжетімді болды. Жалпы, токенизаторға арналған ұлғайтылған максималды сөйлем өлшемінен басқа (4192 орнына 8384), ол v5.2 сияқты құрылымды пайдаланды. Бұл нұсқа одан да жақсы жалпылауды қамтамасыз ету үшін оқыту және бағалау үшін ~2,6 миллион жазбаны пайдаланды. Галлюцинацияның одан әрі азаюы байқалды және модель қазір қазақ морфологиясын өңдеуге шебер болған сияқты. / Fine-tuned on April 28 and made available on the same day. Overall, other than an increased maximum sentence size for the tokeniser (8384 instead of 4192), it used the same structure as v5.2. This version leveraged ~2.6 million records for training and evaluation to ensure even better generalisation. Further reduction of hallucinations was observed, and the model now seems to have become adept at handling Kazakh morphology
147
 
148
  ---
149
 
checkpoints/checkpoint-57000/trainer_state.json DELETED
@@ -1,840 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 1.9748466895333125,
6
- "eval_steps": 500,
7
- "global_step": 57000,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.01732321657485362,
14
- "grad_norm": 0.5299897193908691,
15
- "learning_rate": 4.956778574645741e-05,
16
- "loss": 2.1733,
17
- "step": 500
18
- },
19
- {
20
- "epoch": 0.03464643314970724,
21
- "grad_norm": 0.4744114279747009,
22
- "learning_rate": 4.913470533208606e-05,
23
- "loss": 1.936,
24
- "step": 1000
25
- },
26
- {
27
- "epoch": 0.051969649724560855,
28
- "grad_norm": 0.5358501672744751,
29
- "learning_rate": 4.870162491771472e-05,
30
- "loss": 1.8605,
31
- "step": 1500
32
- },
33
- {
34
- "epoch": 0.06929286629941447,
35
- "grad_norm": 0.5026484131813049,
36
- "learning_rate": 4.826854450334338e-05,
37
- "loss": 1.793,
38
- "step": 2000
39
- },
40
- {
41
- "epoch": 0.0866160828742681,
42
- "grad_norm": 0.5730611681938171,
43
- "learning_rate": 4.7835464088972044e-05,
44
- "loss": 1.7309,
45
- "step": 2500
46
- },
47
- {
48
- "epoch": 0.10393929944912171,
49
- "grad_norm": 0.47230127453804016,
50
- "learning_rate": 4.74023836746007e-05,
51
- "loss": 1.6938,
52
- "step": 3000
53
- },
54
- {
55
- "epoch": 0.12126251602397534,
56
- "grad_norm": 0.6244428157806396,
57
- "learning_rate": 4.6969303260229364e-05,
58
- "loss": 1.6587,
59
- "step": 3500
60
- },
61
- {
62
- "epoch": 0.13858573259882895,
63
- "grad_norm": 0.5069366693496704,
64
- "learning_rate": 4.653622284585802e-05,
65
- "loss": 1.6241,
66
- "step": 4000
67
- },
68
- {
69
- "epoch": 0.15590894917368256,
70
- "grad_norm": 0.567051887512207,
71
- "learning_rate": 4.610314243148668e-05,
72
- "loss": 1.5789,
73
- "step": 4500
74
- },
75
- {
76
- "epoch": 0.1732321657485362,
77
- "grad_norm": 0.5564538836479187,
78
- "learning_rate": 4.567006201711534e-05,
79
- "loss": 1.5619,
80
- "step": 5000
81
- },
82
- {
83
- "epoch": 0.1905553823233898,
84
- "grad_norm": 0.7243916988372803,
85
- "learning_rate": 4.5236981602744e-05,
86
- "loss": 1.5358,
87
- "step": 5500
88
- },
89
- {
90
- "epoch": 0.20787859889824342,
91
- "grad_norm": 0.6585371494293213,
92
- "learning_rate": 4.480390118837266e-05,
93
- "loss": 1.5396,
94
- "step": 6000
95
- },
96
- {
97
- "epoch": 0.22520181547309703,
98
- "grad_norm": 0.5986710786819458,
99
- "learning_rate": 4.437082077400132e-05,
100
- "loss": 1.4991,
101
- "step": 6500
102
- },
103
- {
104
- "epoch": 0.24252503204795067,
105
- "grad_norm": 0.6701385974884033,
106
- "learning_rate": 4.393774035962998e-05,
107
- "loss": 1.4823,
108
- "step": 7000
109
- },
110
- {
111
- "epoch": 0.25984824862280426,
112
- "grad_norm": 0.6745362877845764,
113
- "learning_rate": 4.350465994525863e-05,
114
- "loss": 1.4715,
115
- "step": 7500
116
- },
117
- {
118
- "epoch": 0.2771714651976579,
119
- "grad_norm": 0.6930545568466187,
120
- "learning_rate": 4.30715795308873e-05,
121
- "loss": 1.4354,
122
- "step": 8000
123
- },
124
- {
125
- "epoch": 0.29449468177251154,
126
- "grad_norm": 0.6593695878982544,
127
- "learning_rate": 4.2638499116515954e-05,
128
- "loss": 1.432,
129
- "step": 8500
130
- },
131
- {
132
- "epoch": 0.3118178983473651,
133
- "grad_norm": 0.6918864846229553,
134
- "learning_rate": 4.220541870214462e-05,
135
- "loss": 1.4209,
136
- "step": 9000
137
- },
138
- {
139
- "epoch": 0.32914111492221876,
140
- "grad_norm": 0.7154921889305115,
141
- "learning_rate": 4.1772338287773275e-05,
142
- "loss": 1.391,
143
- "step": 9500
144
- },
145
- {
146
- "epoch": 0.3464643314970724,
147
- "grad_norm": 0.8634065389633179,
148
- "learning_rate": 4.133925787340194e-05,
149
- "loss": 1.3843,
150
- "step": 10000
151
- },
152
- {
153
- "epoch": 0.363787548071926,
154
- "grad_norm": 0.8820887804031372,
155
- "learning_rate": 4.0906177459030595e-05,
156
- "loss": 1.3612,
157
- "step": 10500
158
- },
159
- {
160
- "epoch": 0.3811107646467796,
161
- "grad_norm": 0.9643932580947876,
162
- "learning_rate": 4.047309704465925e-05,
163
- "loss": 1.3326,
164
- "step": 11000
165
- },
166
- {
167
- "epoch": 0.39843398122163326,
168
- "grad_norm": 0.9170373678207397,
169
- "learning_rate": 4.0040016630287916e-05,
170
- "loss": 1.3158,
171
- "step": 11500
172
- },
173
- {
174
- "epoch": 0.41575719779648684,
175
- "grad_norm": 0.8911833167076111,
176
- "learning_rate": 3.960693621591657e-05,
177
- "loss": 1.2982,
178
- "step": 12000
179
- },
180
- {
181
- "epoch": 0.4330804143713405,
182
- "grad_norm": 1.3023812770843506,
183
- "learning_rate": 3.917472196237398e-05,
184
- "loss": 1.2497,
185
- "step": 12500
186
- },
187
- {
188
- "epoch": 0.45040363094619407,
189
- "grad_norm": 1.3367282152175903,
190
- "learning_rate": 3.874164154800263e-05,
191
- "loss": 1.2436,
192
- "step": 13000
193
- },
194
- {
195
- "epoch": 0.4677268475210477,
196
- "grad_norm": 1.070709228515625,
197
- "learning_rate": 3.8308561133631293e-05,
198
- "loss": 1.2068,
199
- "step": 13500
200
- },
201
- {
202
- "epoch": 0.48505006409590135,
203
- "grad_norm": 1.2574782371520996,
204
- "learning_rate": 3.787548071925995e-05,
205
- "loss": 1.1867,
206
- "step": 14000
207
- },
208
- {
209
- "epoch": 0.5023732806707549,
210
- "grad_norm": 1.4889111518859863,
211
- "learning_rate": 3.744326646571736e-05,
212
- "loss": 1.1908,
213
- "step": 14500
214
- },
215
- {
216
- "epoch": 0.5196964972456085,
217
- "grad_norm": 1.2556302547454834,
218
- "learning_rate": 3.7010186051346014e-05,
219
- "loss": 1.1476,
220
- "step": 15000
221
- },
222
- {
223
- "epoch": 0.5370197138204622,
224
- "grad_norm": 1.2582406997680664,
225
- "learning_rate": 3.6577971797803415e-05,
226
- "loss": 1.1268,
227
- "step": 15500
228
- },
229
- {
230
- "epoch": 0.5543429303953158,
231
- "grad_norm": 1.4035013914108276,
232
- "learning_rate": 3.614575754426082e-05,
233
- "loss": 1.119,
234
- "step": 16000
235
- },
236
- {
237
- "epoch": 0.5716661469701694,
238
- "grad_norm": 1.7117546796798706,
239
- "learning_rate": 3.571267712988948e-05,
240
- "loss": 1.106,
241
- "step": 16500
242
- },
243
- {
244
- "epoch": 0.5889893635450231,
245
- "grad_norm": 1.503953218460083,
246
- "learning_rate": 3.5279596715518135e-05,
247
- "loss": 1.08,
248
- "step": 17000
249
- },
250
- {
251
- "epoch": 0.6063125801198767,
252
- "grad_norm": 1.6753602027893066,
253
- "learning_rate": 3.48465163011468e-05,
254
- "loss": 1.055,
255
- "step": 17500
256
- },
257
- {
258
- "epoch": 0.6236357966947302,
259
- "grad_norm": 1.5606533288955688,
260
- "learning_rate": 3.4413435886775456e-05,
261
- "loss": 1.043,
262
- "step": 18000
263
- },
264
- {
265
- "epoch": 0.6409590132695839,
266
- "grad_norm": 2.1966652870178223,
267
- "learning_rate": 3.398122163323286e-05,
268
- "loss": 1.0294,
269
- "step": 18500
270
- },
271
- {
272
- "epoch": 0.6582822298444375,
273
- "grad_norm": 2.0083296298980713,
274
- "learning_rate": 3.354814121886152e-05,
275
- "loss": 1.009,
276
- "step": 19000
277
- },
278
- {
279
- "epoch": 0.6756054464192911,
280
- "grad_norm": 1.61182701587677,
281
- "learning_rate": 3.311506080449018e-05,
282
- "loss": 0.978,
283
- "step": 19500
284
- },
285
- {
286
- "epoch": 0.6929286629941448,
287
- "grad_norm": 1.698030710220337,
288
- "learning_rate": 3.268198039011884e-05,
289
- "loss": 0.9798,
290
- "step": 20000
291
- },
292
- {
293
- "epoch": 0.7102518795689984,
294
- "grad_norm": 1.810142993927002,
295
- "learning_rate": 3.224976613657624e-05,
296
- "loss": 0.9558,
297
- "step": 20500
298
- },
299
- {
300
- "epoch": 0.727575096143852,
301
- "grad_norm": 4.462728023529053,
302
- "learning_rate": 3.1816685722204904e-05,
303
- "loss": 0.9337,
304
- "step": 21000
305
- },
306
- {
307
- "epoch": 0.7448983127187057,
308
- "grad_norm": 3.5566582679748535,
309
- "learning_rate": 3.1384471468662305e-05,
310
- "loss": 0.9115,
311
- "step": 21500
312
- },
313
- {
314
- "epoch": 0.7622215292935592,
315
- "grad_norm": 3.2824957370758057,
316
- "learning_rate": 3.095139105429096e-05,
317
- "loss": 0.8992,
318
- "step": 22000
319
- },
320
- {
321
- "epoch": 0.7795447458684128,
322
- "grad_norm": 3.1051690578460693,
323
- "learning_rate": 3.0518310639919625e-05,
324
- "loss": 0.8945,
325
- "step": 22500
326
- },
327
- {
328
- "epoch": 0.7968679624432665,
329
- "grad_norm": 5.406064510345459,
330
- "learning_rate": 3.0085230225548282e-05,
331
- "loss": 0.8544,
332
- "step": 23000
333
- },
334
- {
335
- "epoch": 0.8141911790181201,
336
- "grad_norm": 3.3169949054718018,
337
- "learning_rate": 2.9653015972005682e-05,
338
- "loss": 0.8494,
339
- "step": 23500
340
- },
341
- {
342
- "epoch": 0.8315143955929737,
343
- "grad_norm": 2.4768149852752686,
344
- "learning_rate": 2.9219935557634343e-05,
345
- "loss": 0.8145,
346
- "step": 24000
347
- },
348
- {
349
- "epoch": 0.8488376121678273,
350
- "grad_norm": 2.6160104274749756,
351
- "learning_rate": 2.8786855143263003e-05,
352
- "loss": 0.7995,
353
- "step": 24500
354
- },
355
- {
356
- "epoch": 0.866160828742681,
357
- "grad_norm": 2.7828731536865234,
358
- "learning_rate": 2.8353774728891663e-05,
359
- "loss": 0.7705,
360
- "step": 25000
361
- },
362
- {
363
- "epoch": 0.8834840453175345,
364
- "grad_norm": 3.508859157562256,
365
- "learning_rate": 2.7921560475349063e-05,
366
- "loss": 0.7565,
367
- "step": 25500
368
- },
369
- {
370
- "epoch": 0.9008072618923881,
371
- "grad_norm": 2.672107219696045,
372
- "learning_rate": 2.7488480060977724e-05,
373
- "loss": 0.7394,
374
- "step": 26000
375
- },
376
- {
377
- "epoch": 0.9181304784672418,
378
- "grad_norm": 2.712963819503784,
379
- "learning_rate": 2.7055399646606384e-05,
380
- "loss": 0.7144,
381
- "step": 26500
382
- },
383
- {
384
- "epoch": 0.9354536950420954,
385
- "grad_norm": 4.505606651306152,
386
- "learning_rate": 2.6622319232235044e-05,
387
- "loss": 0.6964,
388
- "step": 27000
389
- },
390
- {
391
- "epoch": 0.952776911616949,
392
- "grad_norm": 1.9717707633972168,
393
- "learning_rate": 2.619010497869244e-05,
394
- "loss": 0.6642,
395
- "step": 27500
396
- },
397
- {
398
- "epoch": 0.9701001281918027,
399
- "grad_norm": 2.501847743988037,
400
- "learning_rate": 2.57570245643211e-05,
401
- "loss": 0.6542,
402
- "step": 28000
403
- },
404
- {
405
- "epoch": 0.9874233447666563,
406
- "grad_norm": 4.659865379333496,
407
- "learning_rate": 2.532481031077851e-05,
408
- "loss": 0.622,
409
- "step": 28500
410
- },
411
- {
412
- "epoch": 1.0,
413
- "eval_loss": 0.3914269506931305,
414
- "eval_runtime": 1756.0029,
415
- "eval_samples_per_second": 350.648,
416
- "eval_steps_per_second": 43.831,
417
- "step": 28863
418
- },
419
- {
420
- "epoch": 1.0047465613415099,
421
- "grad_norm": 1.5512120723724365,
422
- "learning_rate": 2.4891729896407165e-05,
423
- "loss": 0.6,
424
- "step": 29000
425
- },
426
- {
427
- "epoch": 1.0220697779163634,
428
- "grad_norm": 2.7224926948547363,
429
- "learning_rate": 2.4458649482035826e-05,
430
- "loss": 0.5733,
431
- "step": 29500
432
- },
433
- {
434
- "epoch": 1.039392994491217,
435
- "grad_norm": 2.0893383026123047,
436
- "learning_rate": 2.4025569067664486e-05,
437
- "loss": 0.5517,
438
- "step": 30000
439
- },
440
- {
441
- "epoch": 1.0567162110660708,
442
- "grad_norm": 2.235480785369873,
443
- "learning_rate": 2.3592488653293143e-05,
444
- "loss": 0.5389,
445
- "step": 30500
446
- },
447
- {
448
- "epoch": 1.0740394276409244,
449
- "grad_norm": 2.4926114082336426,
450
- "learning_rate": 2.3159408238921803e-05,
451
- "loss": 0.5282,
452
- "step": 31000
453
- },
454
- {
455
- "epoch": 1.091362644215778,
456
- "grad_norm": 1.9329782724380493,
457
- "learning_rate": 2.2726327824550464e-05,
458
- "loss": 0.5076,
459
- "step": 31500
460
- },
461
- {
462
- "epoch": 1.1086858607906316,
463
- "grad_norm": 1.8705673217773438,
464
- "learning_rate": 2.2293247410179124e-05,
465
- "loss": 0.489,
466
- "step": 32000
467
- },
468
- {
469
- "epoch": 1.1260090773654852,
470
- "grad_norm": 1.8372488021850586,
471
- "learning_rate": 2.186016699580778e-05,
472
- "loss": 0.4701,
473
- "step": 32500
474
- },
475
- {
476
- "epoch": 1.1433322939403388,
477
- "grad_norm": 1.7131813764572144,
478
- "learning_rate": 2.142708658143644e-05,
479
- "loss": 0.4592,
480
- "step": 33000
481
- },
482
- {
483
- "epoch": 1.1606555105151926,
484
- "grad_norm": 3.1449172496795654,
485
- "learning_rate": 2.0994872327893845e-05,
486
- "loss": 0.4408,
487
- "step": 33500
488
- },
489
- {
490
- "epoch": 1.1779787270900461,
491
- "grad_norm": 2.199096918106079,
492
- "learning_rate": 2.0561791913522505e-05,
493
- "loss": 0.4278,
494
- "step": 34000
495
- },
496
- {
497
- "epoch": 1.1953019436648997,
498
- "grad_norm": 1.6359717845916748,
499
- "learning_rate": 2.0128711499151162e-05,
500
- "loss": 0.4238,
501
- "step": 34500
502
- },
503
- {
504
- "epoch": 1.2126251602397533,
505
- "grad_norm": 1.7747199535369873,
506
- "learning_rate": 1.9695631084779822e-05,
507
- "loss": 0.4024,
508
- "step": 35000
509
- },
510
- {
511
- "epoch": 1.229948376814607,
512
- "grad_norm": 2.073904037475586,
513
- "learning_rate": 1.9263416831237226e-05,
514
- "loss": 0.39,
515
- "step": 35500
516
- },
517
- {
518
- "epoch": 1.2472715933894605,
519
- "grad_norm": 1.9012521505355835,
520
- "learning_rate": 1.8830336416865886e-05,
521
- "loss": 0.3856,
522
- "step": 36000
523
- },
524
- {
525
- "epoch": 1.264594809964314,
526
- "grad_norm": 2.2919228076934814,
527
- "learning_rate": 1.8397256002494546e-05,
528
- "loss": 0.3729,
529
- "step": 36500
530
- },
531
- {
532
- "epoch": 1.2819180265391679,
533
- "grad_norm": 1.933583378791809,
534
- "learning_rate": 1.7964175588123203e-05,
535
- "loss": 0.3584,
536
- "step": 37000
537
- },
538
- {
539
- "epoch": 1.2992412431140214,
540
- "grad_norm": 2.1168253421783447,
541
- "learning_rate": 1.7531095173751864e-05,
542
- "loss": 0.356,
543
- "step": 37500
544
- },
545
- {
546
- "epoch": 1.316564459688875,
547
- "grad_norm": 1.7472686767578125,
548
- "learning_rate": 1.7098014759380524e-05,
549
- "loss": 0.339,
550
- "step": 38000
551
- },
552
- {
553
- "epoch": 1.3338876762637286,
554
- "grad_norm": 1.9178825616836548,
555
- "learning_rate": 1.666493434500918e-05,
556
- "loss": 0.3294,
557
- "step": 38500
558
- },
559
- {
560
- "epoch": 1.3512108928385822,
561
- "grad_norm": 1.4763795137405396,
562
- "learning_rate": 1.623185393063784e-05,
563
- "loss": 0.3299,
564
- "step": 39000
565
- },
566
- {
567
- "epoch": 1.368534109413436,
568
- "grad_norm": 1.974442481994629,
569
- "learning_rate": 1.57987735162665e-05,
570
- "loss": 0.3206,
571
- "step": 39500
572
- },
573
- {
574
- "epoch": 1.3858573259882894,
575
- "grad_norm": 1.315453052520752,
576
- "learning_rate": 1.5366559262723905e-05,
577
- "loss": 0.313,
578
- "step": 40000
579
- },
580
- {
581
- "epoch": 1.4031805425631432,
582
- "grad_norm": 1.605042815208435,
583
- "learning_rate": 1.4934345009181305e-05,
584
- "loss": 0.2998,
585
- "step": 40500
586
- },
587
- {
588
- "epoch": 1.4205037591379968,
589
- "grad_norm": 1.5163339376449585,
590
- "learning_rate": 1.4501264594809966e-05,
591
- "loss": 0.3031,
592
- "step": 41000
593
- },
594
- {
595
- "epoch": 1.4378269757128503,
596
- "grad_norm": 1.9643980264663696,
597
- "learning_rate": 1.4068184180438623e-05,
598
- "loss": 0.2922,
599
- "step": 41500
600
- },
601
- {
602
- "epoch": 1.455150192287704,
603
- "grad_norm": 1.4320045709609985,
604
- "learning_rate": 1.3635103766067284e-05,
605
- "loss": 0.2858,
606
- "step": 42000
607
- },
608
- {
609
- "epoch": 1.4724734088625575,
610
- "grad_norm": 1.4415267705917358,
611
- "learning_rate": 1.3202889512524685e-05,
612
- "loss": 0.2807,
613
- "step": 42500
614
- },
615
- {
616
- "epoch": 1.4897966254374113,
617
- "grad_norm": 1.1676790714263916,
618
- "learning_rate": 1.2769809098153345e-05,
619
- "loss": 0.279,
620
- "step": 43000
621
- },
622
- {
623
- "epoch": 1.5071198420122647,
624
- "grad_norm": 1.2914575338363647,
625
- "learning_rate": 1.2336728683782005e-05,
626
- "loss": 0.2658,
627
- "step": 43500
628
- },
629
- {
630
- "epoch": 1.5244430585871185,
631
- "grad_norm": 1.8342537879943848,
632
- "learning_rate": 1.1903648269410666e-05,
633
- "loss": 0.2669,
634
- "step": 44000
635
- },
636
- {
637
- "epoch": 1.541766275161972,
638
- "grad_norm": 1.6250979900360107,
639
- "learning_rate": 1.1471434015868068e-05,
640
- "loss": 0.2615,
641
- "step": 44500
642
- },
643
- {
644
- "epoch": 1.5590894917368257,
645
- "grad_norm": 1.8935564756393433,
646
- "learning_rate": 1.1038353601496726e-05,
647
- "loss": 0.2513,
648
- "step": 45000
649
- },
650
- {
651
- "epoch": 1.5764127083116795,
652
- "grad_norm": 1.7190736532211304,
653
- "learning_rate": 1.0605273187125386e-05,
654
- "loss": 0.2488,
655
- "step": 45500
656
- },
657
- {
658
- "epoch": 1.5937359248865328,
659
- "grad_norm": 1.5352853536605835,
660
- "learning_rate": 1.0172192772754045e-05,
661
- "loss": 0.2502,
662
- "step": 46000
663
- },
664
- {
665
- "epoch": 1.6110591414613866,
666
- "grad_norm": 1.452185869216919,
667
- "learning_rate": 9.739112358382704e-06,
668
- "loss": 0.2441,
669
- "step": 46500
670
- },
671
- {
672
- "epoch": 1.6283823580362402,
673
- "grad_norm": 1.2263524532318115,
674
- "learning_rate": 9.306031944011364e-06,
675
- "loss": 0.2366,
676
- "step": 47000
677
- },
678
- {
679
- "epoch": 1.6457055746110938,
680
- "grad_norm": 1.7002058029174805,
681
- "learning_rate": 8.872951529640024e-06,
682
- "loss": 0.2361,
683
- "step": 47500
684
- },
685
- {
686
- "epoch": 1.6630287911859474,
687
- "grad_norm": 0.8775345087051392,
688
- "learning_rate": 8.439871115268685e-06,
689
- "loss": 0.2314,
690
- "step": 48000
691
- },
692
- {
693
- "epoch": 1.680352007760801,
694
- "grad_norm": 1.68264639377594,
695
- "learning_rate": 8.007656861726086e-06,
696
- "loss": 0.2341,
697
- "step": 48500
698
- },
699
- {
700
- "epoch": 1.6976752243356548,
701
- "grad_norm": 1.4811515808105469,
702
- "learning_rate": 7.574576447354745e-06,
703
- "loss": 0.2238,
704
- "step": 49000
705
- },
706
- {
707
- "epoch": 1.7149984409105081,
708
- "grad_norm": 1.5231512784957886,
709
- "learning_rate": 7.142362193812148e-06,
710
- "loss": 0.2229,
711
- "step": 49500
712
- },
713
- {
714
- "epoch": 1.732321657485362,
715
- "grad_norm": 1.870348334312439,
716
- "learning_rate": 6.709281779440807e-06,
717
- "loss": 0.2211,
718
- "step": 50000
719
- },
720
- {
721
- "epoch": 1.7496448740602155,
722
- "grad_norm": 1.2438400983810425,
723
- "learning_rate": 6.276201365069466e-06,
724
- "loss": 0.2205,
725
- "step": 50500
726
- },
727
- {
728
- "epoch": 1.766968090635069,
729
- "grad_norm": 1.426330327987671,
730
- "learning_rate": 5.843120950698125e-06,
731
- "loss": 0.2189,
732
- "step": 51000
733
- },
734
- {
735
- "epoch": 1.7842913072099227,
736
- "grad_norm": 0.9976692199707031,
737
- "learning_rate": 5.410040536326786e-06,
738
- "loss": 0.2168,
739
- "step": 51500
740
- },
741
- {
742
- "epoch": 1.8016145237847763,
743
- "grad_norm": 0.7628383040428162,
744
- "learning_rate": 4.976960121955445e-06,
745
- "loss": 0.2137,
746
- "step": 52000
747
- },
748
- {
749
- "epoch": 1.81893774035963,
750
- "grad_norm": 1.1943764686584473,
751
- "learning_rate": 4.543879707584105e-06,
752
- "loss": 0.2108,
753
- "step": 52500
754
- },
755
- {
756
- "epoch": 1.8362609569344834,
757
- "grad_norm": 1.6423786878585815,
758
- "learning_rate": 4.110799293212764e-06,
759
- "loss": 0.207,
760
- "step": 53000
761
- },
762
- {
763
- "epoch": 1.8535841735093372,
764
- "grad_norm": 1.1439303159713745,
765
- "learning_rate": 3.678585039670166e-06,
766
- "loss": 0.2102,
767
- "step": 53500
768
- },
769
- {
770
- "epoch": 1.8709073900841908,
771
- "grad_norm": 1.1228655576705933,
772
- "learning_rate": 3.2463707861275687e-06,
773
- "loss": 0.2078,
774
- "step": 54000
775
- },
776
- {
777
- "epoch": 1.8882306066590444,
778
- "grad_norm": 1.6855107545852661,
779
- "learning_rate": 2.8132903717562278e-06,
780
- "loss": 0.2095,
781
- "step": 54500
782
- },
783
- {
784
- "epoch": 1.9055538232338982,
785
- "grad_norm": 1.0492366552352905,
786
- "learning_rate": 2.3802099573848872e-06,
787
- "loss": 0.2059,
788
- "step": 55000
789
- },
790
- {
791
- "epoch": 1.9228770398087516,
792
- "grad_norm": 1.1293083429336548,
793
- "learning_rate": 1.947129543013547e-06,
794
- "loss": 0.2084,
795
- "step": 55500
796
- },
797
- {
798
- "epoch": 1.9402002563836054,
799
- "grad_norm": 0.6616119742393494,
800
- "learning_rate": 1.5140491286422063e-06,
801
- "loss": 0.2004,
802
- "step": 56000
803
- },
804
- {
805
- "epoch": 1.957523472958459,
806
- "grad_norm": 1.435498595237732,
807
- "learning_rate": 1.0809687142708658e-06,
808
- "loss": 0.2,
809
- "step": 56500
810
- },
811
- {
812
- "epoch": 1.9748466895333125,
813
- "grad_norm": 1.5348039865493774,
814
- "learning_rate": 6.478882998995254e-07,
815
- "loss": 0.203,
816
- "step": 57000
817
- }
818
- ],
819
- "logging_steps": 500,
820
- "max_steps": 57726,
821
- "num_input_tokens_seen": 0,
822
- "num_train_epochs": 2,
823
- "save_steps": 500,
824
- "stateful_callbacks": {
825
- "TrainerControl": {
826
- "args": {
827
- "should_epoch_stop": false,
828
- "should_evaluate": false,
829
- "should_log": false,
830
- "should_save": true,
831
- "should_training_stop": false
832
- },
833
- "attributes": {}
834
- }
835
- },
836
- "total_flos": 1.2343118157525811e+17,
837
- "train_batch_size": 32,
838
- "trial_name": null,
839
- "trial_params": null
840
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-57500/trainer_state.json DELETED
@@ -1,847 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 1.9921699061081661,
6
- "eval_steps": 500,
7
- "global_step": 57500,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.01732321657485362,
14
- "grad_norm": 0.5299897193908691,
15
- "learning_rate": 4.956778574645741e-05,
16
- "loss": 2.1733,
17
- "step": 500
18
- },
19
- {
20
- "epoch": 0.03464643314970724,
21
- "grad_norm": 0.4744114279747009,
22
- "learning_rate": 4.913470533208606e-05,
23
- "loss": 1.936,
24
- "step": 1000
25
- },
26
- {
27
- "epoch": 0.051969649724560855,
28
- "grad_norm": 0.5358501672744751,
29
- "learning_rate": 4.870162491771472e-05,
30
- "loss": 1.8605,
31
- "step": 1500
32
- },
33
- {
34
- "epoch": 0.06929286629941447,
35
- "grad_norm": 0.5026484131813049,
36
- "learning_rate": 4.826854450334338e-05,
37
- "loss": 1.793,
38
- "step": 2000
39
- },
40
- {
41
- "epoch": 0.0866160828742681,
42
- "grad_norm": 0.5730611681938171,
43
- "learning_rate": 4.7835464088972044e-05,
44
- "loss": 1.7309,
45
- "step": 2500
46
- },
47
- {
48
- "epoch": 0.10393929944912171,
49
- "grad_norm": 0.47230127453804016,
50
- "learning_rate": 4.74023836746007e-05,
51
- "loss": 1.6938,
52
- "step": 3000
53
- },
54
- {
55
- "epoch": 0.12126251602397534,
56
- "grad_norm": 0.6244428157806396,
57
- "learning_rate": 4.6969303260229364e-05,
58
- "loss": 1.6587,
59
- "step": 3500
60
- },
61
- {
62
- "epoch": 0.13858573259882895,
63
- "grad_norm": 0.5069366693496704,
64
- "learning_rate": 4.653622284585802e-05,
65
- "loss": 1.6241,
66
- "step": 4000
67
- },
68
- {
69
- "epoch": 0.15590894917368256,
70
- "grad_norm": 0.567051887512207,
71
- "learning_rate": 4.610314243148668e-05,
72
- "loss": 1.5789,
73
- "step": 4500
74
- },
75
- {
76
- "epoch": 0.1732321657485362,
77
- "grad_norm": 0.5564538836479187,
78
- "learning_rate": 4.567006201711534e-05,
79
- "loss": 1.5619,
80
- "step": 5000
81
- },
82
- {
83
- "epoch": 0.1905553823233898,
84
- "grad_norm": 0.7243916988372803,
85
- "learning_rate": 4.5236981602744e-05,
86
- "loss": 1.5358,
87
- "step": 5500
88
- },
89
- {
90
- "epoch": 0.20787859889824342,
91
- "grad_norm": 0.6585371494293213,
92
- "learning_rate": 4.480390118837266e-05,
93
- "loss": 1.5396,
94
- "step": 6000
95
- },
96
- {
97
- "epoch": 0.22520181547309703,
98
- "grad_norm": 0.5986710786819458,
99
- "learning_rate": 4.437082077400132e-05,
100
- "loss": 1.4991,
101
- "step": 6500
102
- },
103
- {
104
- "epoch": 0.24252503204795067,
105
- "grad_norm": 0.6701385974884033,
106
- "learning_rate": 4.393774035962998e-05,
107
- "loss": 1.4823,
108
- "step": 7000
109
- },
110
- {
111
- "epoch": 0.25984824862280426,
112
- "grad_norm": 0.6745362877845764,
113
- "learning_rate": 4.350465994525863e-05,
114
- "loss": 1.4715,
115
- "step": 7500
116
- },
117
- {
118
- "epoch": 0.2771714651976579,
119
- "grad_norm": 0.6930545568466187,
120
- "learning_rate": 4.30715795308873e-05,
121
- "loss": 1.4354,
122
- "step": 8000
123
- },
124
- {
125
- "epoch": 0.29449468177251154,
126
- "grad_norm": 0.6593695878982544,
127
- "learning_rate": 4.2638499116515954e-05,
128
- "loss": 1.432,
129
- "step": 8500
130
- },
131
- {
132
- "epoch": 0.3118178983473651,
133
- "grad_norm": 0.6918864846229553,
134
- "learning_rate": 4.220541870214462e-05,
135
- "loss": 1.4209,
136
- "step": 9000
137
- },
138
- {
139
- "epoch": 0.32914111492221876,
140
- "grad_norm": 0.7154921889305115,
141
- "learning_rate": 4.1772338287773275e-05,
142
- "loss": 1.391,
143
- "step": 9500
144
- },
145
- {
146
- "epoch": 0.3464643314970724,
147
- "grad_norm": 0.8634065389633179,
148
- "learning_rate": 4.133925787340194e-05,
149
- "loss": 1.3843,
150
- "step": 10000
151
- },
152
- {
153
- "epoch": 0.363787548071926,
154
- "grad_norm": 0.8820887804031372,
155
- "learning_rate": 4.0906177459030595e-05,
156
- "loss": 1.3612,
157
- "step": 10500
158
- },
159
- {
160
- "epoch": 0.3811107646467796,
161
- "grad_norm": 0.9643932580947876,
162
- "learning_rate": 4.047309704465925e-05,
163
- "loss": 1.3326,
164
- "step": 11000
165
- },
166
- {
167
- "epoch": 0.39843398122163326,
168
- "grad_norm": 0.9170373678207397,
169
- "learning_rate": 4.0040016630287916e-05,
170
- "loss": 1.3158,
171
- "step": 11500
172
- },
173
- {
174
- "epoch": 0.41575719779648684,
175
- "grad_norm": 0.8911833167076111,
176
- "learning_rate": 3.960693621591657e-05,
177
- "loss": 1.2982,
178
- "step": 12000
179
- },
180
- {
181
- "epoch": 0.4330804143713405,
182
- "grad_norm": 1.3023812770843506,
183
- "learning_rate": 3.917472196237398e-05,
184
- "loss": 1.2497,
185
- "step": 12500
186
- },
187
- {
188
- "epoch": 0.45040363094619407,
189
- "grad_norm": 1.3367282152175903,
190
- "learning_rate": 3.874164154800263e-05,
191
- "loss": 1.2436,
192
- "step": 13000
193
- },
194
- {
195
- "epoch": 0.4677268475210477,
196
- "grad_norm": 1.070709228515625,
197
- "learning_rate": 3.8308561133631293e-05,
198
- "loss": 1.2068,
199
- "step": 13500
200
- },
201
- {
202
- "epoch": 0.48505006409590135,
203
- "grad_norm": 1.2574782371520996,
204
- "learning_rate": 3.787548071925995e-05,
205
- "loss": 1.1867,
206
- "step": 14000
207
- },
208
- {
209
- "epoch": 0.5023732806707549,
210
- "grad_norm": 1.4889111518859863,
211
- "learning_rate": 3.744326646571736e-05,
212
- "loss": 1.1908,
213
- "step": 14500
214
- },
215
- {
216
- "epoch": 0.5196964972456085,
217
- "grad_norm": 1.2556302547454834,
218
- "learning_rate": 3.7010186051346014e-05,
219
- "loss": 1.1476,
220
- "step": 15000
221
- },
222
- {
223
- "epoch": 0.5370197138204622,
224
- "grad_norm": 1.2582406997680664,
225
- "learning_rate": 3.6577971797803415e-05,
226
- "loss": 1.1268,
227
- "step": 15500
228
- },
229
- {
230
- "epoch": 0.5543429303953158,
231
- "grad_norm": 1.4035013914108276,
232
- "learning_rate": 3.614575754426082e-05,
233
- "loss": 1.119,
234
- "step": 16000
235
- },
236
- {
237
- "epoch": 0.5716661469701694,
238
- "grad_norm": 1.7117546796798706,
239
- "learning_rate": 3.571267712988948e-05,
240
- "loss": 1.106,
241
- "step": 16500
242
- },
243
- {
244
- "epoch": 0.5889893635450231,
245
- "grad_norm": 1.503953218460083,
246
- "learning_rate": 3.5279596715518135e-05,
247
- "loss": 1.08,
248
- "step": 17000
249
- },
250
- {
251
- "epoch": 0.6063125801198767,
252
- "grad_norm": 1.6753602027893066,
253
- "learning_rate": 3.48465163011468e-05,
254
- "loss": 1.055,
255
- "step": 17500
256
- },
257
- {
258
- "epoch": 0.6236357966947302,
259
- "grad_norm": 1.5606533288955688,
260
- "learning_rate": 3.4413435886775456e-05,
261
- "loss": 1.043,
262
- "step": 18000
263
- },
264
- {
265
- "epoch": 0.6409590132695839,
266
- "grad_norm": 2.1966652870178223,
267
- "learning_rate": 3.398122163323286e-05,
268
- "loss": 1.0294,
269
- "step": 18500
270
- },
271
- {
272
- "epoch": 0.6582822298444375,
273
- "grad_norm": 2.0083296298980713,
274
- "learning_rate": 3.354814121886152e-05,
275
- "loss": 1.009,
276
- "step": 19000
277
- },
278
- {
279
- "epoch": 0.6756054464192911,
280
- "grad_norm": 1.61182701587677,
281
- "learning_rate": 3.311506080449018e-05,
282
- "loss": 0.978,
283
- "step": 19500
284
- },
285
- {
286
- "epoch": 0.6929286629941448,
287
- "grad_norm": 1.698030710220337,
288
- "learning_rate": 3.268198039011884e-05,
289
- "loss": 0.9798,
290
- "step": 20000
291
- },
292
- {
293
- "epoch": 0.7102518795689984,
294
- "grad_norm": 1.810142993927002,
295
- "learning_rate": 3.224976613657624e-05,
296
- "loss": 0.9558,
297
- "step": 20500
298
- },
299
- {
300
- "epoch": 0.727575096143852,
301
- "grad_norm": 4.462728023529053,
302
- "learning_rate": 3.1816685722204904e-05,
303
- "loss": 0.9337,
304
- "step": 21000
305
- },
306
- {
307
- "epoch": 0.7448983127187057,
308
- "grad_norm": 3.5566582679748535,
309
- "learning_rate": 3.1384471468662305e-05,
310
- "loss": 0.9115,
311
- "step": 21500
312
- },
313
- {
314
- "epoch": 0.7622215292935592,
315
- "grad_norm": 3.2824957370758057,
316
- "learning_rate": 3.095139105429096e-05,
317
- "loss": 0.8992,
318
- "step": 22000
319
- },
320
- {
321
- "epoch": 0.7795447458684128,
322
- "grad_norm": 3.1051690578460693,
323
- "learning_rate": 3.0518310639919625e-05,
324
- "loss": 0.8945,
325
- "step": 22500
326
- },
327
- {
328
- "epoch": 0.7968679624432665,
329
- "grad_norm": 5.406064510345459,
330
- "learning_rate": 3.0085230225548282e-05,
331
- "loss": 0.8544,
332
- "step": 23000
333
- },
334
- {
335
- "epoch": 0.8141911790181201,
336
- "grad_norm": 3.3169949054718018,
337
- "learning_rate": 2.9653015972005682e-05,
338
- "loss": 0.8494,
339
- "step": 23500
340
- },
341
- {
342
- "epoch": 0.8315143955929737,
343
- "grad_norm": 2.4768149852752686,
344
- "learning_rate": 2.9219935557634343e-05,
345
- "loss": 0.8145,
346
- "step": 24000
347
- },
348
- {
349
- "epoch": 0.8488376121678273,
350
- "grad_norm": 2.6160104274749756,
351
- "learning_rate": 2.8786855143263003e-05,
352
- "loss": 0.7995,
353
- "step": 24500
354
- },
355
- {
356
- "epoch": 0.866160828742681,
357
- "grad_norm": 2.7828731536865234,
358
- "learning_rate": 2.8353774728891663e-05,
359
- "loss": 0.7705,
360
- "step": 25000
361
- },
362
- {
363
- "epoch": 0.8834840453175345,
364
- "grad_norm": 3.508859157562256,
365
- "learning_rate": 2.7921560475349063e-05,
366
- "loss": 0.7565,
367
- "step": 25500
368
- },
369
- {
370
- "epoch": 0.9008072618923881,
371
- "grad_norm": 2.672107219696045,
372
- "learning_rate": 2.7488480060977724e-05,
373
- "loss": 0.7394,
374
- "step": 26000
375
- },
376
- {
377
- "epoch": 0.9181304784672418,
378
- "grad_norm": 2.712963819503784,
379
- "learning_rate": 2.7055399646606384e-05,
380
- "loss": 0.7144,
381
- "step": 26500
382
- },
383
- {
384
- "epoch": 0.9354536950420954,
385
- "grad_norm": 4.505606651306152,
386
- "learning_rate": 2.6622319232235044e-05,
387
- "loss": 0.6964,
388
- "step": 27000
389
- },
390
- {
391
- "epoch": 0.952776911616949,
392
- "grad_norm": 1.9717707633972168,
393
- "learning_rate": 2.619010497869244e-05,
394
- "loss": 0.6642,
395
- "step": 27500
396
- },
397
- {
398
- "epoch": 0.9701001281918027,
399
- "grad_norm": 2.501847743988037,
400
- "learning_rate": 2.57570245643211e-05,
401
- "loss": 0.6542,
402
- "step": 28000
403
- },
404
- {
405
- "epoch": 0.9874233447666563,
406
- "grad_norm": 4.659865379333496,
407
- "learning_rate": 2.532481031077851e-05,
408
- "loss": 0.622,
409
- "step": 28500
410
- },
411
- {
412
- "epoch": 1.0,
413
- "eval_loss": 0.3914269506931305,
414
- "eval_runtime": 1756.0029,
415
- "eval_samples_per_second": 350.648,
416
- "eval_steps_per_second": 43.831,
417
- "step": 28863
418
- },
419
- {
420
- "epoch": 1.0047465613415099,
421
- "grad_norm": 1.5512120723724365,
422
- "learning_rate": 2.4891729896407165e-05,
423
- "loss": 0.6,
424
- "step": 29000
425
- },
426
- {
427
- "epoch": 1.0220697779163634,
428
- "grad_norm": 2.7224926948547363,
429
- "learning_rate": 2.4458649482035826e-05,
430
- "loss": 0.5733,
431
- "step": 29500
432
- },
433
- {
434
- "epoch": 1.039392994491217,
435
- "grad_norm": 2.0893383026123047,
436
- "learning_rate": 2.4025569067664486e-05,
437
- "loss": 0.5517,
438
- "step": 30000
439
- },
440
- {
441
- "epoch": 1.0567162110660708,
442
- "grad_norm": 2.235480785369873,
443
- "learning_rate": 2.3592488653293143e-05,
444
- "loss": 0.5389,
445
- "step": 30500
446
- },
447
- {
448
- "epoch": 1.0740394276409244,
449
- "grad_norm": 2.4926114082336426,
450
- "learning_rate": 2.3159408238921803e-05,
451
- "loss": 0.5282,
452
- "step": 31000
453
- },
454
- {
455
- "epoch": 1.091362644215778,
456
- "grad_norm": 1.9329782724380493,
457
- "learning_rate": 2.2726327824550464e-05,
458
- "loss": 0.5076,
459
- "step": 31500
460
- },
461
- {
462
- "epoch": 1.1086858607906316,
463
- "grad_norm": 1.8705673217773438,
464
- "learning_rate": 2.2293247410179124e-05,
465
- "loss": 0.489,
466
- "step": 32000
467
- },
468
- {
469
- "epoch": 1.1260090773654852,
470
- "grad_norm": 1.8372488021850586,
471
- "learning_rate": 2.186016699580778e-05,
472
- "loss": 0.4701,
473
- "step": 32500
474
- },
475
- {
476
- "epoch": 1.1433322939403388,
477
- "grad_norm": 1.7131813764572144,
478
- "learning_rate": 2.142708658143644e-05,
479
- "loss": 0.4592,
480
- "step": 33000
481
- },
482
- {
483
- "epoch": 1.1606555105151926,
484
- "grad_norm": 3.1449172496795654,
485
- "learning_rate": 2.0994872327893845e-05,
486
- "loss": 0.4408,
487
- "step": 33500
488
- },
489
- {
490
- "epoch": 1.1779787270900461,
491
- "grad_norm": 2.199096918106079,
492
- "learning_rate": 2.0561791913522505e-05,
493
- "loss": 0.4278,
494
- "step": 34000
495
- },
496
- {
497
- "epoch": 1.1953019436648997,
498
- "grad_norm": 1.6359717845916748,
499
- "learning_rate": 2.0128711499151162e-05,
500
- "loss": 0.4238,
501
- "step": 34500
502
- },
503
- {
504
- "epoch": 1.2126251602397533,
505
- "grad_norm": 1.7747199535369873,
506
- "learning_rate": 1.9695631084779822e-05,
507
- "loss": 0.4024,
508
- "step": 35000
509
- },
510
- {
511
- "epoch": 1.229948376814607,
512
- "grad_norm": 2.073904037475586,
513
- "learning_rate": 1.9263416831237226e-05,
514
- "loss": 0.39,
515
- "step": 35500
516
- },
517
- {
518
- "epoch": 1.2472715933894605,
519
- "grad_norm": 1.9012521505355835,
520
- "learning_rate": 1.8830336416865886e-05,
521
- "loss": 0.3856,
522
- "step": 36000
523
- },
524
- {
525
- "epoch": 1.264594809964314,
526
- "grad_norm": 2.2919228076934814,
527
- "learning_rate": 1.8397256002494546e-05,
528
- "loss": 0.3729,
529
- "step": 36500
530
- },
531
- {
532
- "epoch": 1.2819180265391679,
533
- "grad_norm": 1.933583378791809,
534
- "learning_rate": 1.7964175588123203e-05,
535
- "loss": 0.3584,
536
- "step": 37000
537
- },
538
- {
539
- "epoch": 1.2992412431140214,
540
- "grad_norm": 2.1168253421783447,
541
- "learning_rate": 1.7531095173751864e-05,
542
- "loss": 0.356,
543
- "step": 37500
544
- },
545
- {
546
- "epoch": 1.316564459688875,
547
- "grad_norm": 1.7472686767578125,
548
- "learning_rate": 1.7098014759380524e-05,
549
- "loss": 0.339,
550
- "step": 38000
551
- },
552
- {
553
- "epoch": 1.3338876762637286,
554
- "grad_norm": 1.9178825616836548,
555
- "learning_rate": 1.666493434500918e-05,
556
- "loss": 0.3294,
557
- "step": 38500
558
- },
559
- {
560
- "epoch": 1.3512108928385822,
561
- "grad_norm": 1.4763795137405396,
562
- "learning_rate": 1.623185393063784e-05,
563
- "loss": 0.3299,
564
- "step": 39000
565
- },
566
- {
567
- "epoch": 1.368534109413436,
568
- "grad_norm": 1.974442481994629,
569
- "learning_rate": 1.57987735162665e-05,
570
- "loss": 0.3206,
571
- "step": 39500
572
- },
573
- {
574
- "epoch": 1.3858573259882894,
575
- "grad_norm": 1.315453052520752,
576
- "learning_rate": 1.5366559262723905e-05,
577
- "loss": 0.313,
578
- "step": 40000
579
- },
580
- {
581
- "epoch": 1.4031805425631432,
582
- "grad_norm": 1.605042815208435,
583
- "learning_rate": 1.4934345009181305e-05,
584
- "loss": 0.2998,
585
- "step": 40500
586
- },
587
- {
588
- "epoch": 1.4205037591379968,
589
- "grad_norm": 1.5163339376449585,
590
- "learning_rate": 1.4501264594809966e-05,
591
- "loss": 0.3031,
592
- "step": 41000
593
- },
594
- {
595
- "epoch": 1.4378269757128503,
596
- "grad_norm": 1.9643980264663696,
597
- "learning_rate": 1.4068184180438623e-05,
598
- "loss": 0.2922,
599
- "step": 41500
600
- },
601
- {
602
- "epoch": 1.455150192287704,
603
- "grad_norm": 1.4320045709609985,
604
- "learning_rate": 1.3635103766067284e-05,
605
- "loss": 0.2858,
606
- "step": 42000
607
- },
608
- {
609
- "epoch": 1.4724734088625575,
610
- "grad_norm": 1.4415267705917358,
611
- "learning_rate": 1.3202889512524685e-05,
612
- "loss": 0.2807,
613
- "step": 42500
614
- },
615
- {
616
- "epoch": 1.4897966254374113,
617
- "grad_norm": 1.1676790714263916,
618
- "learning_rate": 1.2769809098153345e-05,
619
- "loss": 0.279,
620
- "step": 43000
621
- },
622
- {
623
- "epoch": 1.5071198420122647,
624
- "grad_norm": 1.2914575338363647,
625
- "learning_rate": 1.2336728683782005e-05,
626
- "loss": 0.2658,
627
- "step": 43500
628
- },
629
- {
630
- "epoch": 1.5244430585871185,
631
- "grad_norm": 1.8342537879943848,
632
- "learning_rate": 1.1903648269410666e-05,
633
- "loss": 0.2669,
634
- "step": 44000
635
- },
636
- {
637
- "epoch": 1.541766275161972,
638
- "grad_norm": 1.6250979900360107,
639
- "learning_rate": 1.1471434015868068e-05,
640
- "loss": 0.2615,
641
- "step": 44500
642
- },
643
- {
644
- "epoch": 1.5590894917368257,
645
- "grad_norm": 1.8935564756393433,
646
- "learning_rate": 1.1038353601496726e-05,
647
- "loss": 0.2513,
648
- "step": 45000
649
- },
650
- {
651
- "epoch": 1.5764127083116795,
652
- "grad_norm": 1.7190736532211304,
653
- "learning_rate": 1.0605273187125386e-05,
654
- "loss": 0.2488,
655
- "step": 45500
656
- },
657
- {
658
- "epoch": 1.5937359248865328,
659
- "grad_norm": 1.5352853536605835,
660
- "learning_rate": 1.0172192772754045e-05,
661
- "loss": 0.2502,
662
- "step": 46000
663
- },
664
- {
665
- "epoch": 1.6110591414613866,
666
- "grad_norm": 1.452185869216919,
667
- "learning_rate": 9.739112358382704e-06,
668
- "loss": 0.2441,
669
- "step": 46500
670
- },
671
- {
672
- "epoch": 1.6283823580362402,
673
- "grad_norm": 1.2263524532318115,
674
- "learning_rate": 9.306031944011364e-06,
675
- "loss": 0.2366,
676
- "step": 47000
677
- },
678
- {
679
- "epoch": 1.6457055746110938,
680
- "grad_norm": 1.7002058029174805,
681
- "learning_rate": 8.872951529640024e-06,
682
- "loss": 0.2361,
683
- "step": 47500
684
- },
685
- {
686
- "epoch": 1.6630287911859474,
687
- "grad_norm": 0.8775345087051392,
688
- "learning_rate": 8.439871115268685e-06,
689
- "loss": 0.2314,
690
- "step": 48000
691
- },
692
- {
693
- "epoch": 1.680352007760801,
694
- "grad_norm": 1.68264639377594,
695
- "learning_rate": 8.007656861726086e-06,
696
- "loss": 0.2341,
697
- "step": 48500
698
- },
699
- {
700
- "epoch": 1.6976752243356548,
701
- "grad_norm": 1.4811515808105469,
702
- "learning_rate": 7.574576447354745e-06,
703
- "loss": 0.2238,
704
- "step": 49000
705
- },
706
- {
707
- "epoch": 1.7149984409105081,
708
- "grad_norm": 1.5231512784957886,
709
- "learning_rate": 7.142362193812148e-06,
710
- "loss": 0.2229,
711
- "step": 49500
712
- },
713
- {
714
- "epoch": 1.732321657485362,
715
- "grad_norm": 1.870348334312439,
716
- "learning_rate": 6.709281779440807e-06,
717
- "loss": 0.2211,
718
- "step": 50000
719
- },
720
- {
721
- "epoch": 1.7496448740602155,
722
- "grad_norm": 1.2438400983810425,
723
- "learning_rate": 6.276201365069466e-06,
724
- "loss": 0.2205,
725
- "step": 50500
726
- },
727
- {
728
- "epoch": 1.766968090635069,
729
- "grad_norm": 1.426330327987671,
730
- "learning_rate": 5.843120950698125e-06,
731
- "loss": 0.2189,
732
- "step": 51000
733
- },
734
- {
735
- "epoch": 1.7842913072099227,
736
- "grad_norm": 0.9976692199707031,
737
- "learning_rate": 5.410040536326786e-06,
738
- "loss": 0.2168,
739
- "step": 51500
740
- },
741
- {
742
- "epoch": 1.8016145237847763,
743
- "grad_norm": 0.7628383040428162,
744
- "learning_rate": 4.976960121955445e-06,
745
- "loss": 0.2137,
746
- "step": 52000
747
- },
748
- {
749
- "epoch": 1.81893774035963,
750
- "grad_norm": 1.1943764686584473,
751
- "learning_rate": 4.543879707584105e-06,
752
- "loss": 0.2108,
753
- "step": 52500
754
- },
755
- {
756
- "epoch": 1.8362609569344834,
757
- "grad_norm": 1.6423786878585815,
758
- "learning_rate": 4.110799293212764e-06,
759
- "loss": 0.207,
760
- "step": 53000
761
- },
762
- {
763
- "epoch": 1.8535841735093372,
764
- "grad_norm": 1.1439303159713745,
765
- "learning_rate": 3.678585039670166e-06,
766
- "loss": 0.2102,
767
- "step": 53500
768
- },
769
- {
770
- "epoch": 1.8709073900841908,
771
- "grad_norm": 1.1228655576705933,
772
- "learning_rate": 3.2463707861275687e-06,
773
- "loss": 0.2078,
774
- "step": 54000
775
- },
776
- {
777
- "epoch": 1.8882306066590444,
778
- "grad_norm": 1.6855107545852661,
779
- "learning_rate": 2.8132903717562278e-06,
780
- "loss": 0.2095,
781
- "step": 54500
782
- },
783
- {
784
- "epoch": 1.9055538232338982,
785
- "grad_norm": 1.0492366552352905,
786
- "learning_rate": 2.3802099573848872e-06,
787
- "loss": 0.2059,
788
- "step": 55000
789
- },
790
- {
791
- "epoch": 1.9228770398087516,
792
- "grad_norm": 1.1293083429336548,
793
- "learning_rate": 1.947129543013547e-06,
794
- "loss": 0.2084,
795
- "step": 55500
796
- },
797
- {
798
- "epoch": 1.9402002563836054,
799
- "grad_norm": 0.6616119742393494,
800
- "learning_rate": 1.5140491286422063e-06,
801
- "loss": 0.2004,
802
- "step": 56000
803
- },
804
- {
805
- "epoch": 1.957523472958459,
806
- "grad_norm": 1.435498595237732,
807
- "learning_rate": 1.0809687142708658e-06,
808
- "loss": 0.2,
809
- "step": 56500
810
- },
811
- {
812
- "epoch": 1.9748466895333125,
813
- "grad_norm": 1.5348039865493774,
814
- "learning_rate": 6.478882998995254e-07,
815
- "loss": 0.203,
816
- "step": 57000
817
- },
818
- {
819
- "epoch": 1.9921699061081661,
820
- "grad_norm": 1.3262860774993896,
821
- "learning_rate": 2.1480788552818487e-07,
822
- "loss": 0.2066,
823
- "step": 57500
824
- }
825
- ],
826
- "logging_steps": 500,
827
- "max_steps": 57726,
828
- "num_input_tokens_seen": 0,
829
- "num_train_epochs": 2,
830
- "save_steps": 500,
831
- "stateful_callbacks": {
832
- "TrainerControl": {
833
- "args": {
834
- "should_epoch_stop": false,
835
- "should_evaluate": false,
836
- "should_log": false,
837
- "should_save": true,
838
- "should_training_stop": false
839
- },
840
- "attributes": {}
841
- }
842
- },
843
- "total_flos": 1.2451391598703411e+17,
844
- "train_batch_size": 32,
845
- "trial_name": null,
846
- "trial_params": null
847
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-57726/trainer_state.json DELETED
@@ -1,847 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 2.0,
6
- "eval_steps": 500,
7
- "global_step": 57726,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.01732321657485362,
14
- "grad_norm": 0.5299897193908691,
15
- "learning_rate": 4.956778574645741e-05,
16
- "loss": 2.1733,
17
- "step": 500
18
- },
19
- {
20
- "epoch": 0.03464643314970724,
21
- "grad_norm": 0.4744114279747009,
22
- "learning_rate": 4.913470533208606e-05,
23
- "loss": 1.936,
24
- "step": 1000
25
- },
26
- {
27
- "epoch": 0.051969649724560855,
28
- "grad_norm": 0.5358501672744751,
29
- "learning_rate": 4.870162491771472e-05,
30
- "loss": 1.8605,
31
- "step": 1500
32
- },
33
- {
34
- "epoch": 0.06929286629941447,
35
- "grad_norm": 0.5026484131813049,
36
- "learning_rate": 4.826854450334338e-05,
37
- "loss": 1.793,
38
- "step": 2000
39
- },
40
- {
41
- "epoch": 0.0866160828742681,
42
- "grad_norm": 0.5730611681938171,
43
- "learning_rate": 4.7835464088972044e-05,
44
- "loss": 1.7309,
45
- "step": 2500
46
- },
47
- {
48
- "epoch": 0.10393929944912171,
49
- "grad_norm": 0.47230127453804016,
50
- "learning_rate": 4.74023836746007e-05,
51
- "loss": 1.6938,
52
- "step": 3000
53
- },
54
- {
55
- "epoch": 0.12126251602397534,
56
- "grad_norm": 0.6244428157806396,
57
- "learning_rate": 4.6969303260229364e-05,
58
- "loss": 1.6587,
59
- "step": 3500
60
- },
61
- {
62
- "epoch": 0.13858573259882895,
63
- "grad_norm": 0.5069366693496704,
64
- "learning_rate": 4.653622284585802e-05,
65
- "loss": 1.6241,
66
- "step": 4000
67
- },
68
- {
69
- "epoch": 0.15590894917368256,
70
- "grad_norm": 0.567051887512207,
71
- "learning_rate": 4.610314243148668e-05,
72
- "loss": 1.5789,
73
- "step": 4500
74
- },
75
- {
76
- "epoch": 0.1732321657485362,
77
- "grad_norm": 0.5564538836479187,
78
- "learning_rate": 4.567006201711534e-05,
79
- "loss": 1.5619,
80
- "step": 5000
81
- },
82
- {
83
- "epoch": 0.1905553823233898,
84
- "grad_norm": 0.7243916988372803,
85
- "learning_rate": 4.5236981602744e-05,
86
- "loss": 1.5358,
87
- "step": 5500
88
- },
89
- {
90
- "epoch": 0.20787859889824342,
91
- "grad_norm": 0.6585371494293213,
92
- "learning_rate": 4.480390118837266e-05,
93
- "loss": 1.5396,
94
- "step": 6000
95
- },
96
- {
97
- "epoch": 0.22520181547309703,
98
- "grad_norm": 0.5986710786819458,
99
- "learning_rate": 4.437082077400132e-05,
100
- "loss": 1.4991,
101
- "step": 6500
102
- },
103
- {
104
- "epoch": 0.24252503204795067,
105
- "grad_norm": 0.6701385974884033,
106
- "learning_rate": 4.393774035962998e-05,
107
- "loss": 1.4823,
108
- "step": 7000
109
- },
110
- {
111
- "epoch": 0.25984824862280426,
112
- "grad_norm": 0.6745362877845764,
113
- "learning_rate": 4.350465994525863e-05,
114
- "loss": 1.4715,
115
- "step": 7500
116
- },
117
- {
118
- "epoch": 0.2771714651976579,
119
- "grad_norm": 0.6930545568466187,
120
- "learning_rate": 4.30715795308873e-05,
121
- "loss": 1.4354,
122
- "step": 8000
123
- },
124
- {
125
- "epoch": 0.29449468177251154,
126
- "grad_norm": 0.6593695878982544,
127
- "learning_rate": 4.2638499116515954e-05,
128
- "loss": 1.432,
129
- "step": 8500
130
- },
131
- {
132
- "epoch": 0.3118178983473651,
133
- "grad_norm": 0.6918864846229553,
134
- "learning_rate": 4.220541870214462e-05,
135
- "loss": 1.4209,
136
- "step": 9000
137
- },
138
- {
139
- "epoch": 0.32914111492221876,
140
- "grad_norm": 0.7154921889305115,
141
- "learning_rate": 4.1772338287773275e-05,
142
- "loss": 1.391,
143
- "step": 9500
144
- },
145
- {
146
- "epoch": 0.3464643314970724,
147
- "grad_norm": 0.8634065389633179,
148
- "learning_rate": 4.133925787340194e-05,
149
- "loss": 1.3843,
150
- "step": 10000
151
- },
152
- {
153
- "epoch": 0.363787548071926,
154
- "grad_norm": 0.8820887804031372,
155
- "learning_rate": 4.0906177459030595e-05,
156
- "loss": 1.3612,
157
- "step": 10500
158
- },
159
- {
160
- "epoch": 0.3811107646467796,
161
- "grad_norm": 0.9643932580947876,
162
- "learning_rate": 4.047309704465925e-05,
163
- "loss": 1.3326,
164
- "step": 11000
165
- },
166
- {
167
- "epoch": 0.39843398122163326,
168
- "grad_norm": 0.9170373678207397,
169
- "learning_rate": 4.0040016630287916e-05,
170
- "loss": 1.3158,
171
- "step": 11500
172
- },
173
- {
174
- "epoch": 0.41575719779648684,
175
- "grad_norm": 0.8911833167076111,
176
- "learning_rate": 3.960693621591657e-05,
177
- "loss": 1.2982,
178
- "step": 12000
179
- },
180
- {
181
- "epoch": 0.4330804143713405,
182
- "grad_norm": 1.3023812770843506,
183
- "learning_rate": 3.917472196237398e-05,
184
- "loss": 1.2497,
185
- "step": 12500
186
- },
187
- {
188
- "epoch": 0.45040363094619407,
189
- "grad_norm": 1.3367282152175903,
190
- "learning_rate": 3.874164154800263e-05,
191
- "loss": 1.2436,
192
- "step": 13000
193
- },
194
- {
195
- "epoch": 0.4677268475210477,
196
- "grad_norm": 1.070709228515625,
197
- "learning_rate": 3.8308561133631293e-05,
198
- "loss": 1.2068,
199
- "step": 13500
200
- },
201
- {
202
- "epoch": 0.48505006409590135,
203
- "grad_norm": 1.2574782371520996,
204
- "learning_rate": 3.787548071925995e-05,
205
- "loss": 1.1867,
206
- "step": 14000
207
- },
208
- {
209
- "epoch": 0.5023732806707549,
210
- "grad_norm": 1.4889111518859863,
211
- "learning_rate": 3.744326646571736e-05,
212
- "loss": 1.1908,
213
- "step": 14500
214
- },
215
- {
216
- "epoch": 0.5196964972456085,
217
- "grad_norm": 1.2556302547454834,
218
- "learning_rate": 3.7010186051346014e-05,
219
- "loss": 1.1476,
220
- "step": 15000
221
- },
222
- {
223
- "epoch": 0.5370197138204622,
224
- "grad_norm": 1.2582406997680664,
225
- "learning_rate": 3.6577971797803415e-05,
226
- "loss": 1.1268,
227
- "step": 15500
228
- },
229
- {
230
- "epoch": 0.5543429303953158,
231
- "grad_norm": 1.4035013914108276,
232
- "learning_rate": 3.614575754426082e-05,
233
- "loss": 1.119,
234
- "step": 16000
235
- },
236
- {
237
- "epoch": 0.5716661469701694,
238
- "grad_norm": 1.7117546796798706,
239
- "learning_rate": 3.571267712988948e-05,
240
- "loss": 1.106,
241
- "step": 16500
242
- },
243
- {
244
- "epoch": 0.5889893635450231,
245
- "grad_norm": 1.503953218460083,
246
- "learning_rate": 3.5279596715518135e-05,
247
- "loss": 1.08,
248
- "step": 17000
249
- },
250
- {
251
- "epoch": 0.6063125801198767,
252
- "grad_norm": 1.6753602027893066,
253
- "learning_rate": 3.48465163011468e-05,
254
- "loss": 1.055,
255
- "step": 17500
256
- },
257
- {
258
- "epoch": 0.6236357966947302,
259
- "grad_norm": 1.5606533288955688,
260
- "learning_rate": 3.4413435886775456e-05,
261
- "loss": 1.043,
262
- "step": 18000
263
- },
264
- {
265
- "epoch": 0.6409590132695839,
266
- "grad_norm": 2.1966652870178223,
267
- "learning_rate": 3.398122163323286e-05,
268
- "loss": 1.0294,
269
- "step": 18500
270
- },
271
- {
272
- "epoch": 0.6582822298444375,
273
- "grad_norm": 2.0083296298980713,
274
- "learning_rate": 3.354814121886152e-05,
275
- "loss": 1.009,
276
- "step": 19000
277
- },
278
- {
279
- "epoch": 0.6756054464192911,
280
- "grad_norm": 1.61182701587677,
281
- "learning_rate": 3.311506080449018e-05,
282
- "loss": 0.978,
283
- "step": 19500
284
- },
285
- {
286
- "epoch": 0.6929286629941448,
287
- "grad_norm": 1.698030710220337,
288
- "learning_rate": 3.268198039011884e-05,
289
- "loss": 0.9798,
290
- "step": 20000
291
- },
292
- {
293
- "epoch": 0.7102518795689984,
294
- "grad_norm": 1.810142993927002,
295
- "learning_rate": 3.224976613657624e-05,
296
- "loss": 0.9558,
297
- "step": 20500
298
- },
299
- {
300
- "epoch": 0.727575096143852,
301
- "grad_norm": 4.462728023529053,
302
- "learning_rate": 3.1816685722204904e-05,
303
- "loss": 0.9337,
304
- "step": 21000
305
- },
306
- {
307
- "epoch": 0.7448983127187057,
308
- "grad_norm": 3.5566582679748535,
309
- "learning_rate": 3.1384471468662305e-05,
310
- "loss": 0.9115,
311
- "step": 21500
312
- },
313
- {
314
- "epoch": 0.7622215292935592,
315
- "grad_norm": 3.2824957370758057,
316
- "learning_rate": 3.095139105429096e-05,
317
- "loss": 0.8992,
318
- "step": 22000
319
- },
320
- {
321
- "epoch": 0.7795447458684128,
322
- "grad_norm": 3.1051690578460693,
323
- "learning_rate": 3.0518310639919625e-05,
324
- "loss": 0.8945,
325
- "step": 22500
326
- },
327
- {
328
- "epoch": 0.7968679624432665,
329
- "grad_norm": 5.406064510345459,
330
- "learning_rate": 3.0085230225548282e-05,
331
- "loss": 0.8544,
332
- "step": 23000
333
- },
334
- {
335
- "epoch": 0.8141911790181201,
336
- "grad_norm": 3.3169949054718018,
337
- "learning_rate": 2.9653015972005682e-05,
338
- "loss": 0.8494,
339
- "step": 23500
340
- },
341
- {
342
- "epoch": 0.8315143955929737,
343
- "grad_norm": 2.4768149852752686,
344
- "learning_rate": 2.9219935557634343e-05,
345
- "loss": 0.8145,
346
- "step": 24000
347
- },
348
- {
349
- "epoch": 0.8488376121678273,
350
- "grad_norm": 2.6160104274749756,
351
- "learning_rate": 2.8786855143263003e-05,
352
- "loss": 0.7995,
353
- "step": 24500
354
- },
355
- {
356
- "epoch": 0.866160828742681,
357
- "grad_norm": 2.7828731536865234,
358
- "learning_rate": 2.8353774728891663e-05,
359
- "loss": 0.7705,
360
- "step": 25000
361
- },
362
- {
363
- "epoch": 0.8834840453175345,
364
- "grad_norm": 3.508859157562256,
365
- "learning_rate": 2.7921560475349063e-05,
366
- "loss": 0.7565,
367
- "step": 25500
368
- },
369
- {
370
- "epoch": 0.9008072618923881,
371
- "grad_norm": 2.672107219696045,
372
- "learning_rate": 2.7488480060977724e-05,
373
- "loss": 0.7394,
374
- "step": 26000
375
- },
376
- {
377
- "epoch": 0.9181304784672418,
378
- "grad_norm": 2.712963819503784,
379
- "learning_rate": 2.7055399646606384e-05,
380
- "loss": 0.7144,
381
- "step": 26500
382
- },
383
- {
384
- "epoch": 0.9354536950420954,
385
- "grad_norm": 4.505606651306152,
386
- "learning_rate": 2.6622319232235044e-05,
387
- "loss": 0.6964,
388
- "step": 27000
389
- },
390
- {
391
- "epoch": 0.952776911616949,
392
- "grad_norm": 1.9717707633972168,
393
- "learning_rate": 2.619010497869244e-05,
394
- "loss": 0.6642,
395
- "step": 27500
396
- },
397
- {
398
- "epoch": 0.9701001281918027,
399
- "grad_norm": 2.501847743988037,
400
- "learning_rate": 2.57570245643211e-05,
401
- "loss": 0.6542,
402
- "step": 28000
403
- },
404
- {
405
- "epoch": 0.9874233447666563,
406
- "grad_norm": 4.659865379333496,
407
- "learning_rate": 2.532481031077851e-05,
408
- "loss": 0.622,
409
- "step": 28500
410
- },
411
- {
412
- "epoch": 1.0,
413
- "eval_loss": 0.3914269506931305,
414
- "eval_runtime": 1756.0029,
415
- "eval_samples_per_second": 350.648,
416
- "eval_steps_per_second": 43.831,
417
- "step": 28863
418
- },
419
- {
420
- "epoch": 1.0047465613415099,
421
- "grad_norm": 1.5512120723724365,
422
- "learning_rate": 2.4891729896407165e-05,
423
- "loss": 0.6,
424
- "step": 29000
425
- },
426
- {
427
- "epoch": 1.0220697779163634,
428
- "grad_norm": 2.7224926948547363,
429
- "learning_rate": 2.4458649482035826e-05,
430
- "loss": 0.5733,
431
- "step": 29500
432
- },
433
- {
434
- "epoch": 1.039392994491217,
435
- "grad_norm": 2.0893383026123047,
436
- "learning_rate": 2.4025569067664486e-05,
437
- "loss": 0.5517,
438
- "step": 30000
439
- },
440
- {
441
- "epoch": 1.0567162110660708,
442
- "grad_norm": 2.235480785369873,
443
- "learning_rate": 2.3592488653293143e-05,
444
- "loss": 0.5389,
445
- "step": 30500
446
- },
447
- {
448
- "epoch": 1.0740394276409244,
449
- "grad_norm": 2.4926114082336426,
450
- "learning_rate": 2.3159408238921803e-05,
451
- "loss": 0.5282,
452
- "step": 31000
453
- },
454
- {
455
- "epoch": 1.091362644215778,
456
- "grad_norm": 1.9329782724380493,
457
- "learning_rate": 2.2726327824550464e-05,
458
- "loss": 0.5076,
459
- "step": 31500
460
- },
461
- {
462
- "epoch": 1.1086858607906316,
463
- "grad_norm": 1.8705673217773438,
464
- "learning_rate": 2.2293247410179124e-05,
465
- "loss": 0.489,
466
- "step": 32000
467
- },
468
- {
469
- "epoch": 1.1260090773654852,
470
- "grad_norm": 1.8372488021850586,
471
- "learning_rate": 2.186016699580778e-05,
472
- "loss": 0.4701,
473
- "step": 32500
474
- },
475
- {
476
- "epoch": 1.1433322939403388,
477
- "grad_norm": 1.7131813764572144,
478
- "learning_rate": 2.142708658143644e-05,
479
- "loss": 0.4592,
480
- "step": 33000
481
- },
482
- {
483
- "epoch": 1.1606555105151926,
484
- "grad_norm": 3.1449172496795654,
485
- "learning_rate": 2.0994872327893845e-05,
486
- "loss": 0.4408,
487
- "step": 33500
488
- },
489
- {
490
- "epoch": 1.1779787270900461,
491
- "grad_norm": 2.199096918106079,
492
- "learning_rate": 2.0561791913522505e-05,
493
- "loss": 0.4278,
494
- "step": 34000
495
- },
496
- {
497
- "epoch": 1.1953019436648997,
498
- "grad_norm": 1.6359717845916748,
499
- "learning_rate": 2.0128711499151162e-05,
500
- "loss": 0.4238,
501
- "step": 34500
502
- },
503
- {
504
- "epoch": 1.2126251602397533,
505
- "grad_norm": 1.7747199535369873,
506
- "learning_rate": 1.9695631084779822e-05,
507
- "loss": 0.4024,
508
- "step": 35000
509
- },
510
- {
511
- "epoch": 1.229948376814607,
512
- "grad_norm": 2.073904037475586,
513
- "learning_rate": 1.9263416831237226e-05,
514
- "loss": 0.39,
515
- "step": 35500
516
- },
517
- {
518
- "epoch": 1.2472715933894605,
519
- "grad_norm": 1.9012521505355835,
520
- "learning_rate": 1.8830336416865886e-05,
521
- "loss": 0.3856,
522
- "step": 36000
523
- },
524
- {
525
- "epoch": 1.264594809964314,
526
- "grad_norm": 2.2919228076934814,
527
- "learning_rate": 1.8397256002494546e-05,
528
- "loss": 0.3729,
529
- "step": 36500
530
- },
531
- {
532
- "epoch": 1.2819180265391679,
533
- "grad_norm": 1.933583378791809,
534
- "learning_rate": 1.7964175588123203e-05,
535
- "loss": 0.3584,
536
- "step": 37000
537
- },
538
- {
539
- "epoch": 1.2992412431140214,
540
- "grad_norm": 2.1168253421783447,
541
- "learning_rate": 1.7531095173751864e-05,
542
- "loss": 0.356,
543
- "step": 37500
544
- },
545
- {
546
- "epoch": 1.316564459688875,
547
- "grad_norm": 1.7472686767578125,
548
- "learning_rate": 1.7098014759380524e-05,
549
- "loss": 0.339,
550
- "step": 38000
551
- },
552
- {
553
- "epoch": 1.3338876762637286,
554
- "grad_norm": 1.9178825616836548,
555
- "learning_rate": 1.666493434500918e-05,
556
- "loss": 0.3294,
557
- "step": 38500
558
- },
559
- {
560
- "epoch": 1.3512108928385822,
561
- "grad_norm": 1.4763795137405396,
562
- "learning_rate": 1.623185393063784e-05,
563
- "loss": 0.3299,
564
- "step": 39000
565
- },
566
- {
567
- "epoch": 1.368534109413436,
568
- "grad_norm": 1.974442481994629,
569
- "learning_rate": 1.57987735162665e-05,
570
- "loss": 0.3206,
571
- "step": 39500
572
- },
573
- {
574
- "epoch": 1.3858573259882894,
575
- "grad_norm": 1.315453052520752,
576
- "learning_rate": 1.5366559262723905e-05,
577
- "loss": 0.313,
578
- "step": 40000
579
- },
580
- {
581
- "epoch": 1.4031805425631432,
582
- "grad_norm": 1.605042815208435,
583
- "learning_rate": 1.4934345009181305e-05,
584
- "loss": 0.2998,
585
- "step": 40500
586
- },
587
- {
588
- "epoch": 1.4205037591379968,
589
- "grad_norm": 1.5163339376449585,
590
- "learning_rate": 1.4501264594809966e-05,
591
- "loss": 0.3031,
592
- "step": 41000
593
- },
594
- {
595
- "epoch": 1.4378269757128503,
596
- "grad_norm": 1.9643980264663696,
597
- "learning_rate": 1.4068184180438623e-05,
598
- "loss": 0.2922,
599
- "step": 41500
600
- },
601
- {
602
- "epoch": 1.455150192287704,
603
- "grad_norm": 1.4320045709609985,
604
- "learning_rate": 1.3635103766067284e-05,
605
- "loss": 0.2858,
606
- "step": 42000
607
- },
608
- {
609
- "epoch": 1.4724734088625575,
610
- "grad_norm": 1.4415267705917358,
611
- "learning_rate": 1.3202889512524685e-05,
612
- "loss": 0.2807,
613
- "step": 42500
614
- },
615
- {
616
- "epoch": 1.4897966254374113,
617
- "grad_norm": 1.1676790714263916,
618
- "learning_rate": 1.2769809098153345e-05,
619
- "loss": 0.279,
620
- "step": 43000
621
- },
622
- {
623
- "epoch": 1.5071198420122647,
624
- "grad_norm": 1.2914575338363647,
625
- "learning_rate": 1.2336728683782005e-05,
626
- "loss": 0.2658,
627
- "step": 43500
628
- },
629
- {
630
- "epoch": 1.5244430585871185,
631
- "grad_norm": 1.8342537879943848,
632
- "learning_rate": 1.1903648269410666e-05,
633
- "loss": 0.2669,
634
- "step": 44000
635
- },
636
- {
637
- "epoch": 1.541766275161972,
638
- "grad_norm": 1.6250979900360107,
639
- "learning_rate": 1.1471434015868068e-05,
640
- "loss": 0.2615,
641
- "step": 44500
642
- },
643
- {
644
- "epoch": 1.5590894917368257,
645
- "grad_norm": 1.8935564756393433,
646
- "learning_rate": 1.1038353601496726e-05,
647
- "loss": 0.2513,
648
- "step": 45000
649
- },
650
- {
651
- "epoch": 1.5764127083116795,
652
- "grad_norm": 1.7190736532211304,
653
- "learning_rate": 1.0605273187125386e-05,
654
- "loss": 0.2488,
655
- "step": 45500
656
- },
657
- {
658
- "epoch": 1.5937359248865328,
659
- "grad_norm": 1.5352853536605835,
660
- "learning_rate": 1.0172192772754045e-05,
661
- "loss": 0.2502,
662
- "step": 46000
663
- },
664
- {
665
- "epoch": 1.6110591414613866,
666
- "grad_norm": 1.452185869216919,
667
- "learning_rate": 9.739112358382704e-06,
668
- "loss": 0.2441,
669
- "step": 46500
670
- },
671
- {
672
- "epoch": 1.6283823580362402,
673
- "grad_norm": 1.2263524532318115,
674
- "learning_rate": 9.306031944011364e-06,
675
- "loss": 0.2366,
676
- "step": 47000
677
- },
678
- {
679
- "epoch": 1.6457055746110938,
680
- "grad_norm": 1.7002058029174805,
681
- "learning_rate": 8.872951529640024e-06,
682
- "loss": 0.2361,
683
- "step": 47500
684
- },
685
- {
686
- "epoch": 1.6630287911859474,
687
- "grad_norm": 0.8775345087051392,
688
- "learning_rate": 8.439871115268685e-06,
689
- "loss": 0.2314,
690
- "step": 48000
691
- },
692
- {
693
- "epoch": 1.680352007760801,
694
- "grad_norm": 1.68264639377594,
695
- "learning_rate": 8.007656861726086e-06,
696
- "loss": 0.2341,
697
- "step": 48500
698
- },
699
- {
700
- "epoch": 1.6976752243356548,
701
- "grad_norm": 1.4811515808105469,
702
- "learning_rate": 7.574576447354745e-06,
703
- "loss": 0.2238,
704
- "step": 49000
705
- },
706
- {
707
- "epoch": 1.7149984409105081,
708
- "grad_norm": 1.5231512784957886,
709
- "learning_rate": 7.142362193812148e-06,
710
- "loss": 0.2229,
711
- "step": 49500
712
- },
713
- {
714
- "epoch": 1.732321657485362,
715
- "grad_norm": 1.870348334312439,
716
- "learning_rate": 6.709281779440807e-06,
717
- "loss": 0.2211,
718
- "step": 50000
719
- },
720
- {
721
- "epoch": 1.7496448740602155,
722
- "grad_norm": 1.2438400983810425,
723
- "learning_rate": 6.276201365069466e-06,
724
- "loss": 0.2205,
725
- "step": 50500
726
- },
727
- {
728
- "epoch": 1.766968090635069,
729
- "grad_norm": 1.426330327987671,
730
- "learning_rate": 5.843120950698125e-06,
731
- "loss": 0.2189,
732
- "step": 51000
733
- },
734
- {
735
- "epoch": 1.7842913072099227,
736
- "grad_norm": 0.9976692199707031,
737
- "learning_rate": 5.410040536326786e-06,
738
- "loss": 0.2168,
739
- "step": 51500
740
- },
741
- {
742
- "epoch": 1.8016145237847763,
743
- "grad_norm": 0.7628383040428162,
744
- "learning_rate": 4.976960121955445e-06,
745
- "loss": 0.2137,
746
- "step": 52000
747
- },
748
- {
749
- "epoch": 1.81893774035963,
750
- "grad_norm": 1.1943764686584473,
751
- "learning_rate": 4.543879707584105e-06,
752
- "loss": 0.2108,
753
- "step": 52500
754
- },
755
- {
756
- "epoch": 1.8362609569344834,
757
- "grad_norm": 1.6423786878585815,
758
- "learning_rate": 4.110799293212764e-06,
759
- "loss": 0.207,
760
- "step": 53000
761
- },
762
- {
763
- "epoch": 1.8535841735093372,
764
- "grad_norm": 1.1439303159713745,
765
- "learning_rate": 3.678585039670166e-06,
766
- "loss": 0.2102,
767
- "step": 53500
768
- },
769
- {
770
- "epoch": 1.8709073900841908,
771
- "grad_norm": 1.1228655576705933,
772
- "learning_rate": 3.2463707861275687e-06,
773
- "loss": 0.2078,
774
- "step": 54000
775
- },
776
- {
777
- "epoch": 1.8882306066590444,
778
- "grad_norm": 1.6855107545852661,
779
- "learning_rate": 2.8132903717562278e-06,
780
- "loss": 0.2095,
781
- "step": 54500
782
- },
783
- {
784
- "epoch": 1.9055538232338982,
785
- "grad_norm": 1.0492366552352905,
786
- "learning_rate": 2.3802099573848872e-06,
787
- "loss": 0.2059,
788
- "step": 55000
789
- },
790
- {
791
- "epoch": 1.9228770398087516,
792
- "grad_norm": 1.1293083429336548,
793
- "learning_rate": 1.947129543013547e-06,
794
- "loss": 0.2084,
795
- "step": 55500
796
- },
797
- {
798
- "epoch": 1.9402002563836054,
799
- "grad_norm": 0.6616119742393494,
800
- "learning_rate": 1.5140491286422063e-06,
801
- "loss": 0.2004,
802
- "step": 56000
803
- },
804
- {
805
- "epoch": 1.957523472958459,
806
- "grad_norm": 1.435498595237732,
807
- "learning_rate": 1.0809687142708658e-06,
808
- "loss": 0.2,
809
- "step": 56500
810
- },
811
- {
812
- "epoch": 1.9748466895333125,
813
- "grad_norm": 1.5348039865493774,
814
- "learning_rate": 6.478882998995254e-07,
815
- "loss": 0.203,
816
- "step": 57000
817
- },
818
- {
819
- "epoch": 1.9921699061081661,
820
- "grad_norm": 1.3262860774993896,
821
- "learning_rate": 2.1480788552818487e-07,
822
- "loss": 0.2066,
823
- "step": 57500
824
- }
825
- ],
826
- "logging_steps": 500,
827
- "max_steps": 57726,
828
- "num_input_tokens_seen": 0,
829
- "num_train_epochs": 2,
830
- "save_steps": 500,
831
- "stateful_callbacks": {
832
- "TrainerControl": {
833
- "args": {
834
- "should_epoch_stop": false,
835
- "should_evaluate": false,
836
- "should_log": false,
837
- "should_save": true,
838
- "should_training_stop": true
839
- },
840
- "attributes": {}
841
- }
842
- },
843
- "total_flos": 1.2500277057395098e+17,
844
- "train_batch_size": 32,
845
- "trial_name": null,
846
- "trial_params": null
847
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/{checkpoint-57000 → checkpoint-61500}/config.json RENAMED
File without changes
checkpoints/{checkpoint-57000 → checkpoint-61500}/generation_config.json RENAMED
File without changes
checkpoints/{checkpoint-57000 → checkpoint-61500}/model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63f9895975a5b82ed4c29e442ce72159b3b7c10a9a2b4da93eb7ccb5c6e40a81
3
  size 258368552
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83c04cb0e96a506016467863a6b4674ff881a6422d2034e2295fbf658914a398
3
  size 258368552
checkpoints/{checkpoint-57500 → checkpoint-61500}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d1130fd40d0adbcf44c8ca3f3524cfbbf67046885893cb932a54d0c64477f63
3
  size 516816826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bec2188f9ea536c328c3af0b020aed0f233ddedcafae07ec2d0d3e2ee0120d24
3
  size 516816826
checkpoints/{checkpoint-57726 → checkpoint-61500}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3315b2708c7cd31671d4955cf490a8f696ef2ad4b7c7fd2d387894f8d3f5b967
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a47ee310263d5d6300401264828ccfb4fec9c1c4181961358106a0f617b86d6f
3
  size 14244
checkpoints/{checkpoint-57726 → checkpoint-61500}/scaler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e342b3190fdac8d92b381a057bbbe10df6254d7b7bfbbe82ee33ca0c7910233e
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c521173cdf16365f5a532f1fd1c976b664d848a2f833b7b603db15856db89a60
3
  size 988
checkpoints/{checkpoint-57000 → checkpoint-61500}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae85305bb47ec250d63523be1d1c003050ef3273ae9c0216554b0430f168d054
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f1363f9495d21091d229315dab764d88aa8a5a850ff84e8ec9dd37b5a8c9d6f
3
  size 1064
checkpoints/{checkpoint-57000 → checkpoint-61500}/special_tokens_map.json RENAMED
File without changes
checkpoints/{checkpoint-57500 → checkpoint-61500}/spiece.model RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:deb4456f78071a8ab6ca5a1698b8a196823b52dea960f8bb0cbcbe082828ead4
3
- size 1040899
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:610d79d092a886e9af7d31b94512c46fd20a5f62557bd5b8d8a5b23f1f78650a
3
+ size 1042971
checkpoints/{checkpoint-57000 → checkpoint-61500}/tokenizer.json RENAMED
The diff for this file is too large to render. See raw diff
 
checkpoints/{checkpoint-57000 → checkpoint-61500}/tokenizer_config.json RENAMED
File without changes
checkpoints/checkpoint-61500/trainer_state.json ADDED
@@ -0,0 +1,903 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.9785892383173813,
6
+ "eval_steps": 500,
7
+ "global_step": 61500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016086222150727903,
14
+ "grad_norm": 0.5078127384185791,
15
+ "learning_rate": 4.959864230101023e-05,
16
+ "loss": 2.1432,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.032172444301455806,
21
+ "grad_norm": 0.4508506655693054,
22
+ "learning_rate": 4.9196480277974395e-05,
23
+ "loss": 1.9093,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.048258666452183706,
28
+ "grad_norm": 0.4430558979511261,
29
+ "learning_rate": 4.879431825493855e-05,
30
+ "loss": 1.8418,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.06434488860291161,
35
+ "grad_norm": 0.4775325059890747,
36
+ "learning_rate": 4.8392156231902713e-05,
37
+ "loss": 1.7771,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.08043111075363951,
42
+ "grad_norm": 0.49685001373291016,
43
+ "learning_rate": 4.7989994208866876e-05,
44
+ "loss": 1.7226,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.09651733290436741,
49
+ "grad_norm": 0.5552434325218201,
50
+ "learning_rate": 4.7587832185831025e-05,
51
+ "loss": 1.6767,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.11260355505509531,
56
+ "grad_norm": 0.6779139637947083,
57
+ "learning_rate": 4.718567016279519e-05,
58
+ "loss": 1.6588,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.12868977720582322,
63
+ "grad_norm": 0.5552022457122803,
64
+ "learning_rate": 4.6783508139759344e-05,
65
+ "loss": 1.603,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.1447759993565511,
70
+ "grad_norm": 0.5302042365074158,
71
+ "learning_rate": 4.638134611672351e-05,
72
+ "loss": 1.5776,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.16086222150727902,
77
+ "grad_norm": 0.5810815691947937,
78
+ "learning_rate": 4.597918409368766e-05,
79
+ "loss": 1.5333,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 0.1769484436580069,
84
+ "grad_norm": 0.5819700956344604,
85
+ "learning_rate": 4.5577022070651826e-05,
86
+ "loss": 1.5168,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 0.19303466580873482,
91
+ "grad_norm": 0.6134072542190552,
92
+ "learning_rate": 4.517486004761599e-05,
93
+ "loss": 1.4748,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 0.2091208879594627,
98
+ "grad_norm": 0.5746152400970459,
99
+ "learning_rate": 4.4772698024580144e-05,
100
+ "loss": 1.4622,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 0.22520711011019062,
105
+ "grad_norm": 0.7663710713386536,
106
+ "learning_rate": 4.437053600154431e-05,
107
+ "loss": 1.4767,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 0.24129333226091854,
112
+ "grad_norm": 0.7993176579475403,
113
+ "learning_rate": 4.396837397850846e-05,
114
+ "loss": 1.4527,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 0.25737955441164645,
119
+ "grad_norm": 0.6892676949501038,
120
+ "learning_rate": 4.3566211955472626e-05,
121
+ "loss": 1.4325,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 0.2734657765623743,
126
+ "grad_norm": 0.6928556561470032,
127
+ "learning_rate": 4.316404993243678e-05,
128
+ "loss": 1.4038,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 0.2895519987131022,
133
+ "grad_norm": 0.7578593492507935,
134
+ "learning_rate": 4.2761887909400944e-05,
135
+ "loss": 1.3945,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 0.30563822086383013,
140
+ "grad_norm": 0.7504703402519226,
141
+ "learning_rate": 4.23597258863651e-05,
142
+ "loss": 1.3644,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 0.32172444301455805,
147
+ "grad_norm": 0.8370710611343384,
148
+ "learning_rate": 4.1957563863329256e-05,
149
+ "loss": 1.3619,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 0.3378106651652859,
154
+ "grad_norm": 0.8501142263412476,
155
+ "learning_rate": 4.155540184029342e-05,
156
+ "loss": 1.3448,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 0.3538968873160138,
161
+ "grad_norm": 0.9001900553703308,
162
+ "learning_rate": 4.1153239817257575e-05,
163
+ "loss": 1.3004,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 0.36998310946674173,
168
+ "grad_norm": 1.0658681392669678,
169
+ "learning_rate": 4.075107779422174e-05,
170
+ "loss": 1.2789,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 0.38606933161746965,
175
+ "grad_norm": 1.1038371324539185,
176
+ "learning_rate": 4.0348915771185894e-05,
177
+ "loss": 1.2651,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 0.40215555376819756,
182
+ "grad_norm": 1.2004213333129883,
183
+ "learning_rate": 3.994755807219613e-05,
184
+ "loss": 1.2216,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 0.4182417759189254,
189
+ "grad_norm": 1.235543966293335,
190
+ "learning_rate": 3.9545396049160286e-05,
191
+ "loss": 1.1955,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 0.43432799806965333,
196
+ "grad_norm": 1.5088828802108765,
197
+ "learning_rate": 3.914323402612445e-05,
198
+ "loss": 1.1836,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 0.45041422022038125,
203
+ "grad_norm": 1.264153242111206,
204
+ "learning_rate": 3.8741072003088605e-05,
205
+ "loss": 1.1658,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 0.46650044237110916,
210
+ "grad_norm": 1.3023343086242676,
211
+ "learning_rate": 3.833971430409884e-05,
212
+ "loss": 1.1481,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 0.48258666452183707,
217
+ "grad_norm": 1.3824670314788818,
218
+ "learning_rate": 3.7938356605109064e-05,
219
+ "loss": 1.1221,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 0.49867288667256493,
224
+ "grad_norm": 1.4364969730377197,
225
+ "learning_rate": 3.75369989061193e-05,
226
+ "loss": 1.1057,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 0.5147591088232929,
231
+ "grad_norm": 2.051701545715332,
232
+ "learning_rate": 3.7134836883083456e-05,
233
+ "loss": 1.0873,
234
+ "step": 16000
235
+ },
236
+ {
237
+ "epoch": 0.5308453309740208,
238
+ "grad_norm": 1.4329720735549927,
239
+ "learning_rate": 3.673267486004762e-05,
240
+ "loss": 1.0607,
241
+ "step": 16500
242
+ },
243
+ {
244
+ "epoch": 0.5469315531247486,
245
+ "grad_norm": 1.4981014728546143,
246
+ "learning_rate": 3.6330512837011775e-05,
247
+ "loss": 1.0516,
248
+ "step": 17000
249
+ },
250
+ {
251
+ "epoch": 0.5630177752754766,
252
+ "grad_norm": 1.3012079000473022,
253
+ "learning_rate": 3.592835081397594e-05,
254
+ "loss": 1.0317,
255
+ "step": 17500
256
+ },
257
+ {
258
+ "epoch": 0.5791039974262044,
259
+ "grad_norm": 1.401825189590454,
260
+ "learning_rate": 3.552699311498617e-05,
261
+ "loss": 1.0183,
262
+ "step": 18000
263
+ },
264
+ {
265
+ "epoch": 0.5951902195769324,
266
+ "grad_norm": 2.0783369541168213,
267
+ "learning_rate": 3.512483109195033e-05,
268
+ "loss": 0.9985,
269
+ "step": 18500
270
+ },
271
+ {
272
+ "epoch": 0.6112764417276603,
273
+ "grad_norm": 2.3940794467926025,
274
+ "learning_rate": 3.4722669068914486e-05,
275
+ "loss": 0.9698,
276
+ "step": 19000
277
+ },
278
+ {
279
+ "epoch": 0.6273626638783881,
280
+ "grad_norm": 1.4747998714447021,
281
+ "learning_rate": 3.432050704587865e-05,
282
+ "loss": 0.9657,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 0.6434488860291161,
287
+ "grad_norm": 3.0782012939453125,
288
+ "learning_rate": 3.391914934688888e-05,
289
+ "loss": 0.9379,
290
+ "step": 20000
291
+ },
292
+ {
293
+ "epoch": 0.659535108179844,
294
+ "grad_norm": 2.4914307594299316,
295
+ "learning_rate": 3.3516987323853034e-05,
296
+ "loss": 0.915,
297
+ "step": 20500
298
+ },
299
+ {
300
+ "epoch": 0.6756213303305718,
301
+ "grad_norm": 2.772120237350464,
302
+ "learning_rate": 3.3115629624863264e-05,
303
+ "loss": 0.9047,
304
+ "step": 21000
305
+ },
306
+ {
307
+ "epoch": 0.6917075524812998,
308
+ "grad_norm": 2.519575595855713,
309
+ "learning_rate": 3.271346760182743e-05,
310
+ "loss": 0.8688,
311
+ "step": 21500
312
+ },
313
+ {
314
+ "epoch": 0.7077937746320276,
315
+ "grad_norm": 4.085098743438721,
316
+ "learning_rate": 3.231130557879158e-05,
317
+ "loss": 0.8581,
318
+ "step": 22000
319
+ },
320
+ {
321
+ "epoch": 0.7238799967827556,
322
+ "grad_norm": 1.4670002460479736,
323
+ "learning_rate": 3.1909143555755745e-05,
324
+ "loss": 0.8354,
325
+ "step": 22500
326
+ },
327
+ {
328
+ "epoch": 0.7399662189334835,
329
+ "grad_norm": 2.4749488830566406,
330
+ "learning_rate": 3.1507785856765975e-05,
331
+ "loss": 0.8108,
332
+ "step": 23000
333
+ },
334
+ {
335
+ "epoch": 0.7560524410842113,
336
+ "grad_norm": 1.8635029792785645,
337
+ "learning_rate": 3.110562383373014e-05,
338
+ "loss": 0.7773,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 0.7721386632349393,
343
+ "grad_norm": 3.5713748931884766,
344
+ "learning_rate": 3.0703461810694294e-05,
345
+ "loss": 0.756,
346
+ "step": 24000
347
+ },
348
+ {
349
+ "epoch": 0.7882248853856672,
350
+ "grad_norm": 1.8903526067733765,
351
+ "learning_rate": 3.0301299787658456e-05,
352
+ "loss": 0.7326,
353
+ "step": 24500
354
+ },
355
+ {
356
+ "epoch": 0.8043111075363951,
357
+ "grad_norm": 8.286703109741211,
358
+ "learning_rate": 2.9899942088668686e-05,
359
+ "loss": 0.6948,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 0.820397329687123,
364
+ "grad_norm": 2.2209272384643555,
365
+ "learning_rate": 2.9497780065632845e-05,
366
+ "loss": 0.6914,
367
+ "step": 25500
368
+ },
369
+ {
370
+ "epoch": 0.8364835518378508,
371
+ "grad_norm": 2.2284536361694336,
372
+ "learning_rate": 2.9095618042597e-05,
373
+ "loss": 0.6585,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 0.8525697739885788,
378
+ "grad_norm": 3.4615938663482666,
379
+ "learning_rate": 2.869345601956116e-05,
380
+ "loss": 0.633,
381
+ "step": 26500
382
+ },
383
+ {
384
+ "epoch": 0.8686559961393067,
385
+ "grad_norm": 3.1158838272094727,
386
+ "learning_rate": 2.829209832057139e-05,
387
+ "loss": 0.6181,
388
+ "step": 27000
389
+ },
390
+ {
391
+ "epoch": 0.8847422182900346,
392
+ "grad_norm": 2.3320417404174805,
393
+ "learning_rate": 2.7889936297535553e-05,
394
+ "loss": 0.5993,
395
+ "step": 27500
396
+ },
397
+ {
398
+ "epoch": 0.9008284404407625,
399
+ "grad_norm": 1.8331427574157715,
400
+ "learning_rate": 2.7487774274499712e-05,
401
+ "loss": 0.5839,
402
+ "step": 28000
403
+ },
404
+ {
405
+ "epoch": 0.9169146625914903,
406
+ "grad_norm": 3.2398369312286377,
407
+ "learning_rate": 2.708561225146387e-05,
408
+ "loss": 0.562,
409
+ "step": 28500
410
+ },
411
+ {
412
+ "epoch": 0.9330008847422183,
413
+ "grad_norm": 1.6575061082839966,
414
+ "learning_rate": 2.66842545524741e-05,
415
+ "loss": 0.5313,
416
+ "step": 29000
417
+ },
418
+ {
419
+ "epoch": 0.9490871068929462,
420
+ "grad_norm": 2.1604230403900146,
421
+ "learning_rate": 2.6282092529438264e-05,
422
+ "loss": 0.5203,
423
+ "step": 29500
424
+ },
425
+ {
426
+ "epoch": 0.9651733290436741,
427
+ "grad_norm": 3.3743808269500732,
428
+ "learning_rate": 2.5879930506402423e-05,
429
+ "loss": 0.4938,
430
+ "step": 30000
431
+ },
432
+ {
433
+ "epoch": 0.981259551194402,
434
+ "grad_norm": 3.766514301300049,
435
+ "learning_rate": 2.5477768483366583e-05,
436
+ "loss": 0.4724,
437
+ "step": 30500
438
+ },
439
+ {
440
+ "epoch": 0.9973457733451299,
441
+ "grad_norm": 2.26712703704834,
442
+ "learning_rate": 2.5075606460330742e-05,
443
+ "loss": 0.4656,
444
+ "step": 31000
445
+ },
446
+ {
447
+ "epoch": 1.0,
448
+ "eval_loss": 0.26554691791534424,
449
+ "eval_runtime": 1917.4803,
450
+ "eval_samples_per_second": 345.81,
451
+ "eval_steps_per_second": 43.227,
452
+ "step": 31083
453
+ },
454
+ {
455
+ "epoch": 1.0134159092737072,
456
+ "grad_norm": 2.1041958332061768,
457
+ "learning_rate": 2.467424876134097e-05,
458
+ "loss": 0.4381,
459
+ "step": 31500
460
+ },
461
+ {
462
+ "epoch": 1.029502131424435,
463
+ "grad_norm": 1.7629106044769287,
464
+ "learning_rate": 2.427208673830513e-05,
465
+ "loss": 0.4298,
466
+ "step": 32000
467
+ },
468
+ {
469
+ "epoch": 1.0455883535751629,
470
+ "grad_norm": 2.5032904148101807,
471
+ "learning_rate": 2.386992471526929e-05,
472
+ "loss": 0.4188,
473
+ "step": 32500
474
+ },
475
+ {
476
+ "epoch": 1.0616745757258907,
477
+ "grad_norm": 1.6467881202697754,
478
+ "learning_rate": 2.3467762692233446e-05,
479
+ "loss": 0.3986,
480
+ "step": 33000
481
+ },
482
+ {
483
+ "epoch": 1.0777607978766186,
484
+ "grad_norm": 1.957220435142517,
485
+ "learning_rate": 2.3065600669197606e-05,
486
+ "loss": 0.382,
487
+ "step": 33500
488
+ },
489
+ {
490
+ "epoch": 1.0938470200273467,
491
+ "grad_norm": 1.6566946506500244,
492
+ "learning_rate": 2.2663438646161765e-05,
493
+ "loss": 0.3689,
494
+ "step": 34000
495
+ },
496
+ {
497
+ "epoch": 1.1099332421780745,
498
+ "grad_norm": 2.081613540649414,
499
+ "learning_rate": 2.2261276623125928e-05,
500
+ "loss": 0.3603,
501
+ "step": 34500
502
+ },
503
+ {
504
+ "epoch": 1.1260194643288024,
505
+ "grad_norm": 2.155226945877075,
506
+ "learning_rate": 2.1859918924136157e-05,
507
+ "loss": 0.3478,
508
+ "step": 35000
509
+ },
510
+ {
511
+ "epoch": 1.1421056864795303,
512
+ "grad_norm": 1.9459590911865234,
513
+ "learning_rate": 2.1457756901100317e-05,
514
+ "loss": 0.3315,
515
+ "step": 35500
516
+ },
517
+ {
518
+ "epoch": 1.1581919086302581,
519
+ "grad_norm": 2.3381567001342773,
520
+ "learning_rate": 2.1055594878064476e-05,
521
+ "loss": 0.3259,
522
+ "step": 36000
523
+ },
524
+ {
525
+ "epoch": 1.1742781307809862,
526
+ "grad_norm": 1.4302254915237427,
527
+ "learning_rate": 2.0653432855028635e-05,
528
+ "loss": 0.3168,
529
+ "step": 36500
530
+ },
531
+ {
532
+ "epoch": 1.190364352931714,
533
+ "grad_norm": 1.1770597696304321,
534
+ "learning_rate": 2.0251270831992795e-05,
535
+ "loss": 0.3082,
536
+ "step": 37000
537
+ },
538
+ {
539
+ "epoch": 1.206450575082442,
540
+ "grad_norm": 1.7475298643112183,
541
+ "learning_rate": 1.9849913133003024e-05,
542
+ "loss": 0.3014,
543
+ "step": 37500
544
+ },
545
+ {
546
+ "epoch": 1.2225367972331698,
547
+ "grad_norm": 1.2397468090057373,
548
+ "learning_rate": 1.9447751109967187e-05,
549
+ "loss": 0.288,
550
+ "step": 38000
551
+ },
552
+ {
553
+ "epoch": 1.2386230193838976,
554
+ "grad_norm": 1.6603740453720093,
555
+ "learning_rate": 1.9045589086931343e-05,
556
+ "loss": 0.2797,
557
+ "step": 38500
558
+ },
559
+ {
560
+ "epoch": 1.2547092415346257,
561
+ "grad_norm": 1.7009538412094116,
562
+ "learning_rate": 1.8643427063895502e-05,
563
+ "loss": 0.275,
564
+ "step": 39000
565
+ },
566
+ {
567
+ "epoch": 1.2707954636853536,
568
+ "grad_norm": 1.4941717386245728,
569
+ "learning_rate": 1.8241265040859662e-05,
570
+ "loss": 0.2623,
571
+ "step": 39500
572
+ },
573
+ {
574
+ "epoch": 1.2868816858360814,
575
+ "grad_norm": 1.941115140914917,
576
+ "learning_rate": 1.7839907341869895e-05,
577
+ "loss": 0.2572,
578
+ "step": 40000
579
+ },
580
+ {
581
+ "epoch": 1.3029679079868093,
582
+ "grad_norm": 1.487726092338562,
583
+ "learning_rate": 1.7437745318834054e-05,
584
+ "loss": 0.2502,
585
+ "step": 40500
586
+ },
587
+ {
588
+ "epoch": 1.3190541301375371,
589
+ "grad_norm": 1.4628674983978271,
590
+ "learning_rate": 1.7035583295798213e-05,
591
+ "loss": 0.2437,
592
+ "step": 41000
593
+ },
594
+ {
595
+ "epoch": 1.3351403522882652,
596
+ "grad_norm": 1.401607632637024,
597
+ "learning_rate": 1.663342127276237e-05,
598
+ "loss": 0.2421,
599
+ "step": 41500
600
+ },
601
+ {
602
+ "epoch": 1.351226574438993,
603
+ "grad_norm": 1.1497563123703003,
604
+ "learning_rate": 1.623125924972653e-05,
605
+ "loss": 0.231,
606
+ "step": 42000
607
+ },
608
+ {
609
+ "epoch": 1.367312796589721,
610
+ "grad_norm": 1.322836995124817,
611
+ "learning_rate": 1.5829097226690688e-05,
612
+ "loss": 0.2261,
613
+ "step": 42500
614
+ },
615
+ {
616
+ "epoch": 1.3833990187404488,
617
+ "grad_norm": 1.5328525304794312,
618
+ "learning_rate": 1.542773952770092e-05,
619
+ "loss": 0.2177,
620
+ "step": 43000
621
+ },
622
+ {
623
+ "epoch": 1.3994852408911767,
624
+ "grad_norm": 1.7748241424560547,
625
+ "learning_rate": 1.502557750466508e-05,
626
+ "loss": 0.2186,
627
+ "step": 43500
628
+ },
629
+ {
630
+ "epoch": 1.4155714630419047,
631
+ "grad_norm": 1.6542141437530518,
632
+ "learning_rate": 1.4623415481629241e-05,
633
+ "loss": 0.2138,
634
+ "step": 44000
635
+ },
636
+ {
637
+ "epoch": 1.4316576851926326,
638
+ "grad_norm": 1.3098843097686768,
639
+ "learning_rate": 1.4221253458593397e-05,
640
+ "loss": 0.211,
641
+ "step": 44500
642
+ },
643
+ {
644
+ "epoch": 1.4477439073433604,
645
+ "grad_norm": 1.345651626586914,
646
+ "learning_rate": 1.3819091435557557e-05,
647
+ "loss": 0.2027,
648
+ "step": 45000
649
+ },
650
+ {
651
+ "epoch": 1.4638301294940883,
652
+ "grad_norm": 1.4520297050476074,
653
+ "learning_rate": 1.3416929412521718e-05,
654
+ "loss": 0.2039,
655
+ "step": 45500
656
+ },
657
+ {
658
+ "epoch": 1.4799163516448162,
659
+ "grad_norm": 1.5913499593734741,
660
+ "learning_rate": 1.3014767389485877e-05,
661
+ "loss": 0.1939,
662
+ "step": 46000
663
+ },
664
+ {
665
+ "epoch": 1.4960025737955442,
666
+ "grad_norm": 1.1803226470947266,
667
+ "learning_rate": 1.2612605366450037e-05,
668
+ "loss": 0.1887,
669
+ "step": 46500
670
+ },
671
+ {
672
+ "epoch": 1.5120887959462719,
673
+ "grad_norm": 1.1462236642837524,
674
+ "learning_rate": 1.2210443343414194e-05,
675
+ "loss": 0.1883,
676
+ "step": 47000
677
+ },
678
+ {
679
+ "epoch": 1.528175018097,
680
+ "grad_norm": 0.8483968377113342,
681
+ "learning_rate": 1.1808281320378355e-05,
682
+ "loss": 0.1809,
683
+ "step": 47500
684
+ },
685
+ {
686
+ "epoch": 1.5442612402477278,
687
+ "grad_norm": 1.1205823421478271,
688
+ "learning_rate": 1.1406119297342515e-05,
689
+ "loss": 0.1813,
690
+ "step": 48000
691
+ },
692
+ {
693
+ "epoch": 1.5603474623984557,
694
+ "grad_norm": 1.417622447013855,
695
+ "learning_rate": 1.1003957274306672e-05,
696
+ "loss": 0.1788,
697
+ "step": 48500
698
+ },
699
+ {
700
+ "epoch": 1.5764336845491838,
701
+ "grad_norm": 1.179103970527649,
702
+ "learning_rate": 1.0602599575316904e-05,
703
+ "loss": 0.1809,
704
+ "step": 49000
705
+ },
706
+ {
707
+ "epoch": 1.5925199066999114,
708
+ "grad_norm": 1.1092889308929443,
709
+ "learning_rate": 1.0200437552281065e-05,
710
+ "loss": 0.1734,
711
+ "step": 49500
712
+ },
713
+ {
714
+ "epoch": 1.6086061288506395,
715
+ "grad_norm": 1.0196574926376343,
716
+ "learning_rate": 9.798275529245222e-06,
717
+ "loss": 0.1688,
718
+ "step": 50000
719
+ },
720
+ {
721
+ "epoch": 1.6246923510013673,
722
+ "grad_norm": 1.1376862525939941,
723
+ "learning_rate": 9.396113506209382e-06,
724
+ "loss": 0.1703,
725
+ "step": 50500
726
+ },
727
+ {
728
+ "epoch": 1.6407785731520952,
729
+ "grad_norm": 0.8885149955749512,
730
+ "learning_rate": 8.995560131265685e-06,
731
+ "loss": 0.1691,
732
+ "step": 51000
733
+ },
734
+ {
735
+ "epoch": 1.6568647953028233,
736
+ "grad_norm": 1.2574944496154785,
737
+ "learning_rate": 8.593398108229844e-06,
738
+ "loss": 0.1615,
739
+ "step": 51500
740
+ },
741
+ {
742
+ "epoch": 1.672951017453551,
743
+ "grad_norm": 1.2620723247528076,
744
+ "learning_rate": 8.191236085194004e-06,
745
+ "loss": 0.1593,
746
+ "step": 52000
747
+ },
748
+ {
749
+ "epoch": 1.689037239604279,
750
+ "grad_norm": 1.551480770111084,
751
+ "learning_rate": 7.789074062158163e-06,
752
+ "loss": 0.1639,
753
+ "step": 52500
754
+ },
755
+ {
756
+ "epoch": 1.7051234617550068,
757
+ "grad_norm": 1.5938962697982788,
758
+ "learning_rate": 7.386912039122322e-06,
759
+ "loss": 0.1587,
760
+ "step": 53000
761
+ },
762
+ {
763
+ "epoch": 1.7212096839057347,
764
+ "grad_norm": 1.0503953695297241,
765
+ "learning_rate": 6.984750016086482e-06,
766
+ "loss": 0.1599,
767
+ "step": 53500
768
+ },
769
+ {
770
+ "epoch": 1.7372959060564628,
771
+ "grad_norm": 1.1205036640167236,
772
+ "learning_rate": 6.583392317096712e-06,
773
+ "loss": 0.1541,
774
+ "step": 54000
775
+ },
776
+ {
777
+ "epoch": 1.7533821282071904,
778
+ "grad_norm": 0.7524433732032776,
779
+ "learning_rate": 6.181230294060872e-06,
780
+ "loss": 0.1521,
781
+ "step": 54500
782
+ },
783
+ {
784
+ "epoch": 1.7694683503579185,
785
+ "grad_norm": 0.9619775414466858,
786
+ "learning_rate": 5.779068271025031e-06,
787
+ "loss": 0.1521,
788
+ "step": 55000
789
+ },
790
+ {
791
+ "epoch": 1.7855545725086464,
792
+ "grad_norm": 0.9406844973564148,
793
+ "learning_rate": 5.37690624798919e-06,
794
+ "loss": 0.1509,
795
+ "step": 55500
796
+ },
797
+ {
798
+ "epoch": 1.8016407946593742,
799
+ "grad_norm": 0.9363726377487183,
800
+ "learning_rate": 4.975548548999421e-06,
801
+ "loss": 0.1513,
802
+ "step": 56000
803
+ },
804
+ {
805
+ "epoch": 1.8177270168101023,
806
+ "grad_norm": 0.9941402673721313,
807
+ "learning_rate": 4.573386525963581e-06,
808
+ "loss": 0.1484,
809
+ "step": 56500
810
+ },
811
+ {
812
+ "epoch": 1.83381323896083,
813
+ "grad_norm": 1.3756345510482788,
814
+ "learning_rate": 4.17122450292774e-06,
815
+ "loss": 0.1509,
816
+ "step": 57000
817
+ },
818
+ {
819
+ "epoch": 1.849899461111558,
820
+ "grad_norm": 1.0644595623016357,
821
+ "learning_rate": 3.7690624798918986e-06,
822
+ "loss": 0.1486,
823
+ "step": 57500
824
+ },
825
+ {
826
+ "epoch": 1.8659856832622859,
827
+ "grad_norm": 1.070890188217163,
828
+ "learning_rate": 3.3669004568560584e-06,
829
+ "loss": 0.1462,
830
+ "step": 58000
831
+ },
832
+ {
833
+ "epoch": 1.8820719054130137,
834
+ "grad_norm": 1.3034768104553223,
835
+ "learning_rate": 2.9647384338202173e-06,
836
+ "loss": 0.1481,
837
+ "step": 58500
838
+ },
839
+ {
840
+ "epoch": 1.8981581275637418,
841
+ "grad_norm": 1.127517580986023,
842
+ "learning_rate": 2.5625764107843767e-06,
843
+ "loss": 0.1451,
844
+ "step": 59000
845
+ },
846
+ {
847
+ "epoch": 1.9142443497144694,
848
+ "grad_norm": 0.9431403279304504,
849
+ "learning_rate": 2.1604143877485364e-06,
850
+ "loss": 0.1458,
851
+ "step": 59500
852
+ },
853
+ {
854
+ "epoch": 1.9303305718651975,
855
+ "grad_norm": 1.271483302116394,
856
+ "learning_rate": 1.7590566887587673e-06,
857
+ "loss": 0.1463,
858
+ "step": 60000
859
+ },
860
+ {
861
+ "epoch": 1.9464167940159254,
862
+ "grad_norm": 0.7327952980995178,
863
+ "learning_rate": 1.3568946657229264e-06,
864
+ "loss": 0.1434,
865
+ "step": 60500
866
+ },
867
+ {
868
+ "epoch": 1.9625030161666532,
869
+ "grad_norm": 1.0670543909072876,
870
+ "learning_rate": 9.547326426870858e-07,
871
+ "loss": 0.1424,
872
+ "step": 61000
873
+ },
874
+ {
875
+ "epoch": 1.9785892383173813,
876
+ "grad_norm": 1.2705425024032593,
877
+ "learning_rate": 5.525706196512451e-07,
878
+ "loss": 0.1431,
879
+ "step": 61500
880
+ }
881
+ ],
882
+ "logging_steps": 500,
883
+ "max_steps": 62164,
884
+ "num_input_tokens_seen": 0,
885
+ "num_train_epochs": 2,
886
+ "save_steps": 500,
887
+ "stateful_callbacks": {
888
+ "TrainerControl": {
889
+ "args": {
890
+ "should_epoch_stop": false,
891
+ "should_evaluate": false,
892
+ "should_log": false,
893
+ "should_save": true,
894
+ "should_training_stop": false
895
+ },
896
+ "attributes": {}
897
+ }
898
+ },
899
+ "total_flos": 1.3317420101507482e+17,
900
+ "train_batch_size": 32,
901
+ "trial_name": null,
902
+ "trial_params": null
903
+ }
checkpoints/{checkpoint-57000 → checkpoint-61500}/training_args.bin RENAMED
File without changes
checkpoints/{checkpoint-57500 → checkpoint-62000}/config.json RENAMED
File without changes
checkpoints/{checkpoint-57500 → checkpoint-62000}/generation_config.json RENAMED
File without changes
checkpoints/{checkpoint-57500 → checkpoint-62000}/model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6eedfde9e773f9378b18dc1b9648c00ad912c38ace530092373cda3d1fb2874
3
  size 258368552
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d634b9fb199c6f1740da63ed5e87bdff20b813c9bbfbdb4c438dd27d4ab4a43
3
  size 258368552
checkpoints/{checkpoint-57000 → checkpoint-62000}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ab390d937f3438596f5b73e90523ee2feab36d34cef8954dd79274323c18190
3
  size 516816826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92792cccad0cffd7b1d855e6c4fcd33ba25e1aa7b12ae985fc20d5c190b15298
3
  size 516816826
checkpoints/{checkpoint-57500 → checkpoint-62000}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b895e2e6f2678d5f15acd23697cb989b4374303c17e1529ff920b8dfe800feb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82cb63ee008746fe97e2f989e0faf59793321720b17b5d1bb583c8252b44d2dd
3
  size 14244
checkpoints/{checkpoint-57500 → checkpoint-62000}/scaler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d2e6b1bad6b3de955c41a51000e593dfbb10ebc3fb750400aa7333bf03f6a68
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b157befbc8b431a674c196f6e4cac76d2908e26ebf928dd38c5fd8e80ccd7dfb
3
  size 988
checkpoints/{checkpoint-57726 → checkpoint-62000}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4da1bb3f7f646b57c9e2d03ece470d97442e16e9ace2394b979836215ed5a8c7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82a779fd4c5c40ea80ef0219096be07750b466b66cb885a86a58a8d3e8a2829d
3
  size 1064
checkpoints/{checkpoint-57500 → checkpoint-62000}/special_tokens_map.json RENAMED
File without changes
checkpoints/{checkpoint-57000 → checkpoint-62000}/spiece.model RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:deb4456f78071a8ab6ca5a1698b8a196823b52dea960f8bb0cbcbe082828ead4
3
- size 1040899
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:610d79d092a886e9af7d31b94512c46fd20a5f62557bd5b8d8a5b23f1f78650a
3
+ size 1042971
checkpoints/{checkpoint-57500 → checkpoint-62000}/tokenizer.json RENAMED
The diff for this file is too large to render. See raw diff
 
checkpoints/{checkpoint-57500 → checkpoint-62000}/tokenizer_config.json RENAMED
File without changes
checkpoints/checkpoint-62000/trainer_state.json ADDED
@@ -0,0 +1,910 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.994675460468109,
6
+ "eval_steps": 500,
7
+ "global_step": 62000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016086222150727903,
14
+ "grad_norm": 0.5078127384185791,
15
+ "learning_rate": 4.959864230101023e-05,
16
+ "loss": 2.1432,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.032172444301455806,
21
+ "grad_norm": 0.4508506655693054,
22
+ "learning_rate": 4.9196480277974395e-05,
23
+ "loss": 1.9093,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.048258666452183706,
28
+ "grad_norm": 0.4430558979511261,
29
+ "learning_rate": 4.879431825493855e-05,
30
+ "loss": 1.8418,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.06434488860291161,
35
+ "grad_norm": 0.4775325059890747,
36
+ "learning_rate": 4.8392156231902713e-05,
37
+ "loss": 1.7771,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.08043111075363951,
42
+ "grad_norm": 0.49685001373291016,
43
+ "learning_rate": 4.7989994208866876e-05,
44
+ "loss": 1.7226,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.09651733290436741,
49
+ "grad_norm": 0.5552434325218201,
50
+ "learning_rate": 4.7587832185831025e-05,
51
+ "loss": 1.6767,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.11260355505509531,
56
+ "grad_norm": 0.6779139637947083,
57
+ "learning_rate": 4.718567016279519e-05,
58
+ "loss": 1.6588,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.12868977720582322,
63
+ "grad_norm": 0.5552022457122803,
64
+ "learning_rate": 4.6783508139759344e-05,
65
+ "loss": 1.603,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.1447759993565511,
70
+ "grad_norm": 0.5302042365074158,
71
+ "learning_rate": 4.638134611672351e-05,
72
+ "loss": 1.5776,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.16086222150727902,
77
+ "grad_norm": 0.5810815691947937,
78
+ "learning_rate": 4.597918409368766e-05,
79
+ "loss": 1.5333,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 0.1769484436580069,
84
+ "grad_norm": 0.5819700956344604,
85
+ "learning_rate": 4.5577022070651826e-05,
86
+ "loss": 1.5168,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 0.19303466580873482,
91
+ "grad_norm": 0.6134072542190552,
92
+ "learning_rate": 4.517486004761599e-05,
93
+ "loss": 1.4748,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 0.2091208879594627,
98
+ "grad_norm": 0.5746152400970459,
99
+ "learning_rate": 4.4772698024580144e-05,
100
+ "loss": 1.4622,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 0.22520711011019062,
105
+ "grad_norm": 0.7663710713386536,
106
+ "learning_rate": 4.437053600154431e-05,
107
+ "loss": 1.4767,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 0.24129333226091854,
112
+ "grad_norm": 0.7993176579475403,
113
+ "learning_rate": 4.396837397850846e-05,
114
+ "loss": 1.4527,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 0.25737955441164645,
119
+ "grad_norm": 0.6892676949501038,
120
+ "learning_rate": 4.3566211955472626e-05,
121
+ "loss": 1.4325,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 0.2734657765623743,
126
+ "grad_norm": 0.6928556561470032,
127
+ "learning_rate": 4.316404993243678e-05,
128
+ "loss": 1.4038,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 0.2895519987131022,
133
+ "grad_norm": 0.7578593492507935,
134
+ "learning_rate": 4.2761887909400944e-05,
135
+ "loss": 1.3945,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 0.30563822086383013,
140
+ "grad_norm": 0.7504703402519226,
141
+ "learning_rate": 4.23597258863651e-05,
142
+ "loss": 1.3644,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 0.32172444301455805,
147
+ "grad_norm": 0.8370710611343384,
148
+ "learning_rate": 4.1957563863329256e-05,
149
+ "loss": 1.3619,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 0.3378106651652859,
154
+ "grad_norm": 0.8501142263412476,
155
+ "learning_rate": 4.155540184029342e-05,
156
+ "loss": 1.3448,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 0.3538968873160138,
161
+ "grad_norm": 0.9001900553703308,
162
+ "learning_rate": 4.1153239817257575e-05,
163
+ "loss": 1.3004,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 0.36998310946674173,
168
+ "grad_norm": 1.0658681392669678,
169
+ "learning_rate": 4.075107779422174e-05,
170
+ "loss": 1.2789,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 0.38606933161746965,
175
+ "grad_norm": 1.1038371324539185,
176
+ "learning_rate": 4.0348915771185894e-05,
177
+ "loss": 1.2651,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 0.40215555376819756,
182
+ "grad_norm": 1.2004213333129883,
183
+ "learning_rate": 3.994755807219613e-05,
184
+ "loss": 1.2216,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 0.4182417759189254,
189
+ "grad_norm": 1.235543966293335,
190
+ "learning_rate": 3.9545396049160286e-05,
191
+ "loss": 1.1955,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 0.43432799806965333,
196
+ "grad_norm": 1.5088828802108765,
197
+ "learning_rate": 3.914323402612445e-05,
198
+ "loss": 1.1836,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 0.45041422022038125,
203
+ "grad_norm": 1.264153242111206,
204
+ "learning_rate": 3.8741072003088605e-05,
205
+ "loss": 1.1658,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 0.46650044237110916,
210
+ "grad_norm": 1.3023343086242676,
211
+ "learning_rate": 3.833971430409884e-05,
212
+ "loss": 1.1481,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 0.48258666452183707,
217
+ "grad_norm": 1.3824670314788818,
218
+ "learning_rate": 3.7938356605109064e-05,
219
+ "loss": 1.1221,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 0.49867288667256493,
224
+ "grad_norm": 1.4364969730377197,
225
+ "learning_rate": 3.75369989061193e-05,
226
+ "loss": 1.1057,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 0.5147591088232929,
231
+ "grad_norm": 2.051701545715332,
232
+ "learning_rate": 3.7134836883083456e-05,
233
+ "loss": 1.0873,
234
+ "step": 16000
235
+ },
236
+ {
237
+ "epoch": 0.5308453309740208,
238
+ "grad_norm": 1.4329720735549927,
239
+ "learning_rate": 3.673267486004762e-05,
240
+ "loss": 1.0607,
241
+ "step": 16500
242
+ },
243
+ {
244
+ "epoch": 0.5469315531247486,
245
+ "grad_norm": 1.4981014728546143,
246
+ "learning_rate": 3.6330512837011775e-05,
247
+ "loss": 1.0516,
248
+ "step": 17000
249
+ },
250
+ {
251
+ "epoch": 0.5630177752754766,
252
+ "grad_norm": 1.3012079000473022,
253
+ "learning_rate": 3.592835081397594e-05,
254
+ "loss": 1.0317,
255
+ "step": 17500
256
+ },
257
+ {
258
+ "epoch": 0.5791039974262044,
259
+ "grad_norm": 1.401825189590454,
260
+ "learning_rate": 3.552699311498617e-05,
261
+ "loss": 1.0183,
262
+ "step": 18000
263
+ },
264
+ {
265
+ "epoch": 0.5951902195769324,
266
+ "grad_norm": 2.0783369541168213,
267
+ "learning_rate": 3.512483109195033e-05,
268
+ "loss": 0.9985,
269
+ "step": 18500
270
+ },
271
+ {
272
+ "epoch": 0.6112764417276603,
273
+ "grad_norm": 2.3940794467926025,
274
+ "learning_rate": 3.4722669068914486e-05,
275
+ "loss": 0.9698,
276
+ "step": 19000
277
+ },
278
+ {
279
+ "epoch": 0.6273626638783881,
280
+ "grad_norm": 1.4747998714447021,
281
+ "learning_rate": 3.432050704587865e-05,
282
+ "loss": 0.9657,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 0.6434488860291161,
287
+ "grad_norm": 3.0782012939453125,
288
+ "learning_rate": 3.391914934688888e-05,
289
+ "loss": 0.9379,
290
+ "step": 20000
291
+ },
292
+ {
293
+ "epoch": 0.659535108179844,
294
+ "grad_norm": 2.4914307594299316,
295
+ "learning_rate": 3.3516987323853034e-05,
296
+ "loss": 0.915,
297
+ "step": 20500
298
+ },
299
+ {
300
+ "epoch": 0.6756213303305718,
301
+ "grad_norm": 2.772120237350464,
302
+ "learning_rate": 3.3115629624863264e-05,
303
+ "loss": 0.9047,
304
+ "step": 21000
305
+ },
306
+ {
307
+ "epoch": 0.6917075524812998,
308
+ "grad_norm": 2.519575595855713,
309
+ "learning_rate": 3.271346760182743e-05,
310
+ "loss": 0.8688,
311
+ "step": 21500
312
+ },
313
+ {
314
+ "epoch": 0.7077937746320276,
315
+ "grad_norm": 4.085098743438721,
316
+ "learning_rate": 3.231130557879158e-05,
317
+ "loss": 0.8581,
318
+ "step": 22000
319
+ },
320
+ {
321
+ "epoch": 0.7238799967827556,
322
+ "grad_norm": 1.4670002460479736,
323
+ "learning_rate": 3.1909143555755745e-05,
324
+ "loss": 0.8354,
325
+ "step": 22500
326
+ },
327
+ {
328
+ "epoch": 0.7399662189334835,
329
+ "grad_norm": 2.4749488830566406,
330
+ "learning_rate": 3.1507785856765975e-05,
331
+ "loss": 0.8108,
332
+ "step": 23000
333
+ },
334
+ {
335
+ "epoch": 0.7560524410842113,
336
+ "grad_norm": 1.8635029792785645,
337
+ "learning_rate": 3.110562383373014e-05,
338
+ "loss": 0.7773,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 0.7721386632349393,
343
+ "grad_norm": 3.5713748931884766,
344
+ "learning_rate": 3.0703461810694294e-05,
345
+ "loss": 0.756,
346
+ "step": 24000
347
+ },
348
+ {
349
+ "epoch": 0.7882248853856672,
350
+ "grad_norm": 1.8903526067733765,
351
+ "learning_rate": 3.0301299787658456e-05,
352
+ "loss": 0.7326,
353
+ "step": 24500
354
+ },
355
+ {
356
+ "epoch": 0.8043111075363951,
357
+ "grad_norm": 8.286703109741211,
358
+ "learning_rate": 2.9899942088668686e-05,
359
+ "loss": 0.6948,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 0.820397329687123,
364
+ "grad_norm": 2.2209272384643555,
365
+ "learning_rate": 2.9497780065632845e-05,
366
+ "loss": 0.6914,
367
+ "step": 25500
368
+ },
369
+ {
370
+ "epoch": 0.8364835518378508,
371
+ "grad_norm": 2.2284536361694336,
372
+ "learning_rate": 2.9095618042597e-05,
373
+ "loss": 0.6585,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 0.8525697739885788,
378
+ "grad_norm": 3.4615938663482666,
379
+ "learning_rate": 2.869345601956116e-05,
380
+ "loss": 0.633,
381
+ "step": 26500
382
+ },
383
+ {
384
+ "epoch": 0.8686559961393067,
385
+ "grad_norm": 3.1158838272094727,
386
+ "learning_rate": 2.829209832057139e-05,
387
+ "loss": 0.6181,
388
+ "step": 27000
389
+ },
390
+ {
391
+ "epoch": 0.8847422182900346,
392
+ "grad_norm": 2.3320417404174805,
393
+ "learning_rate": 2.7889936297535553e-05,
394
+ "loss": 0.5993,
395
+ "step": 27500
396
+ },
397
+ {
398
+ "epoch": 0.9008284404407625,
399
+ "grad_norm": 1.8331427574157715,
400
+ "learning_rate": 2.7487774274499712e-05,
401
+ "loss": 0.5839,
402
+ "step": 28000
403
+ },
404
+ {
405
+ "epoch": 0.9169146625914903,
406
+ "grad_norm": 3.2398369312286377,
407
+ "learning_rate": 2.708561225146387e-05,
408
+ "loss": 0.562,
409
+ "step": 28500
410
+ },
411
+ {
412
+ "epoch": 0.9330008847422183,
413
+ "grad_norm": 1.6575061082839966,
414
+ "learning_rate": 2.66842545524741e-05,
415
+ "loss": 0.5313,
416
+ "step": 29000
417
+ },
418
+ {
419
+ "epoch": 0.9490871068929462,
420
+ "grad_norm": 2.1604230403900146,
421
+ "learning_rate": 2.6282092529438264e-05,
422
+ "loss": 0.5203,
423
+ "step": 29500
424
+ },
425
+ {
426
+ "epoch": 0.9651733290436741,
427
+ "grad_norm": 3.3743808269500732,
428
+ "learning_rate": 2.5879930506402423e-05,
429
+ "loss": 0.4938,
430
+ "step": 30000
431
+ },
432
+ {
433
+ "epoch": 0.981259551194402,
434
+ "grad_norm": 3.766514301300049,
435
+ "learning_rate": 2.5477768483366583e-05,
436
+ "loss": 0.4724,
437
+ "step": 30500
438
+ },
439
+ {
440
+ "epoch": 0.9973457733451299,
441
+ "grad_norm": 2.26712703704834,
442
+ "learning_rate": 2.5075606460330742e-05,
443
+ "loss": 0.4656,
444
+ "step": 31000
445
+ },
446
+ {
447
+ "epoch": 1.0,
448
+ "eval_loss": 0.26554691791534424,
449
+ "eval_runtime": 1917.4803,
450
+ "eval_samples_per_second": 345.81,
451
+ "eval_steps_per_second": 43.227,
452
+ "step": 31083
453
+ },
454
+ {
455
+ "epoch": 1.0134159092737072,
456
+ "grad_norm": 2.1041958332061768,
457
+ "learning_rate": 2.467424876134097e-05,
458
+ "loss": 0.4381,
459
+ "step": 31500
460
+ },
461
+ {
462
+ "epoch": 1.029502131424435,
463
+ "grad_norm": 1.7629106044769287,
464
+ "learning_rate": 2.427208673830513e-05,
465
+ "loss": 0.4298,
466
+ "step": 32000
467
+ },
468
+ {
469
+ "epoch": 1.0455883535751629,
470
+ "grad_norm": 2.5032904148101807,
471
+ "learning_rate": 2.386992471526929e-05,
472
+ "loss": 0.4188,
473
+ "step": 32500
474
+ },
475
+ {
476
+ "epoch": 1.0616745757258907,
477
+ "grad_norm": 1.6467881202697754,
478
+ "learning_rate": 2.3467762692233446e-05,
479
+ "loss": 0.3986,
480
+ "step": 33000
481
+ },
482
+ {
483
+ "epoch": 1.0777607978766186,
484
+ "grad_norm": 1.957220435142517,
485
+ "learning_rate": 2.3065600669197606e-05,
486
+ "loss": 0.382,
487
+ "step": 33500
488
+ },
489
+ {
490
+ "epoch": 1.0938470200273467,
491
+ "grad_norm": 1.6566946506500244,
492
+ "learning_rate": 2.2663438646161765e-05,
493
+ "loss": 0.3689,
494
+ "step": 34000
495
+ },
496
+ {
497
+ "epoch": 1.1099332421780745,
498
+ "grad_norm": 2.081613540649414,
499
+ "learning_rate": 2.2261276623125928e-05,
500
+ "loss": 0.3603,
501
+ "step": 34500
502
+ },
503
+ {
504
+ "epoch": 1.1260194643288024,
505
+ "grad_norm": 2.155226945877075,
506
+ "learning_rate": 2.1859918924136157e-05,
507
+ "loss": 0.3478,
508
+ "step": 35000
509
+ },
510
+ {
511
+ "epoch": 1.1421056864795303,
512
+ "grad_norm": 1.9459590911865234,
513
+ "learning_rate": 2.1457756901100317e-05,
514
+ "loss": 0.3315,
515
+ "step": 35500
516
+ },
517
+ {
518
+ "epoch": 1.1581919086302581,
519
+ "grad_norm": 2.3381567001342773,
520
+ "learning_rate": 2.1055594878064476e-05,
521
+ "loss": 0.3259,
522
+ "step": 36000
523
+ },
524
+ {
525
+ "epoch": 1.1742781307809862,
526
+ "grad_norm": 1.4302254915237427,
527
+ "learning_rate": 2.0653432855028635e-05,
528
+ "loss": 0.3168,
529
+ "step": 36500
530
+ },
531
+ {
532
+ "epoch": 1.190364352931714,
533
+ "grad_norm": 1.1770597696304321,
534
+ "learning_rate": 2.0251270831992795e-05,
535
+ "loss": 0.3082,
536
+ "step": 37000
537
+ },
538
+ {
539
+ "epoch": 1.206450575082442,
540
+ "grad_norm": 1.7475298643112183,
541
+ "learning_rate": 1.9849913133003024e-05,
542
+ "loss": 0.3014,
543
+ "step": 37500
544
+ },
545
+ {
546
+ "epoch": 1.2225367972331698,
547
+ "grad_norm": 1.2397468090057373,
548
+ "learning_rate": 1.9447751109967187e-05,
549
+ "loss": 0.288,
550
+ "step": 38000
551
+ },
552
+ {
553
+ "epoch": 1.2386230193838976,
554
+ "grad_norm": 1.6603740453720093,
555
+ "learning_rate": 1.9045589086931343e-05,
556
+ "loss": 0.2797,
557
+ "step": 38500
558
+ },
559
+ {
560
+ "epoch": 1.2547092415346257,
561
+ "grad_norm": 1.7009538412094116,
562
+ "learning_rate": 1.8643427063895502e-05,
563
+ "loss": 0.275,
564
+ "step": 39000
565
+ },
566
+ {
567
+ "epoch": 1.2707954636853536,
568
+ "grad_norm": 1.4941717386245728,
569
+ "learning_rate": 1.8241265040859662e-05,
570
+ "loss": 0.2623,
571
+ "step": 39500
572
+ },
573
+ {
574
+ "epoch": 1.2868816858360814,
575
+ "grad_norm": 1.941115140914917,
576
+ "learning_rate": 1.7839907341869895e-05,
577
+ "loss": 0.2572,
578
+ "step": 40000
579
+ },
580
+ {
581
+ "epoch": 1.3029679079868093,
582
+ "grad_norm": 1.487726092338562,
583
+ "learning_rate": 1.7437745318834054e-05,
584
+ "loss": 0.2502,
585
+ "step": 40500
586
+ },
587
+ {
588
+ "epoch": 1.3190541301375371,
589
+ "grad_norm": 1.4628674983978271,
590
+ "learning_rate": 1.7035583295798213e-05,
591
+ "loss": 0.2437,
592
+ "step": 41000
593
+ },
594
+ {
595
+ "epoch": 1.3351403522882652,
596
+ "grad_norm": 1.401607632637024,
597
+ "learning_rate": 1.663342127276237e-05,
598
+ "loss": 0.2421,
599
+ "step": 41500
600
+ },
601
+ {
602
+ "epoch": 1.351226574438993,
603
+ "grad_norm": 1.1497563123703003,
604
+ "learning_rate": 1.623125924972653e-05,
605
+ "loss": 0.231,
606
+ "step": 42000
607
+ },
608
+ {
609
+ "epoch": 1.367312796589721,
610
+ "grad_norm": 1.322836995124817,
611
+ "learning_rate": 1.5829097226690688e-05,
612
+ "loss": 0.2261,
613
+ "step": 42500
614
+ },
615
+ {
616
+ "epoch": 1.3833990187404488,
617
+ "grad_norm": 1.5328525304794312,
618
+ "learning_rate": 1.542773952770092e-05,
619
+ "loss": 0.2177,
620
+ "step": 43000
621
+ },
622
+ {
623
+ "epoch": 1.3994852408911767,
624
+ "grad_norm": 1.7748241424560547,
625
+ "learning_rate": 1.502557750466508e-05,
626
+ "loss": 0.2186,
627
+ "step": 43500
628
+ },
629
+ {
630
+ "epoch": 1.4155714630419047,
631
+ "grad_norm": 1.6542141437530518,
632
+ "learning_rate": 1.4623415481629241e-05,
633
+ "loss": 0.2138,
634
+ "step": 44000
635
+ },
636
+ {
637
+ "epoch": 1.4316576851926326,
638
+ "grad_norm": 1.3098843097686768,
639
+ "learning_rate": 1.4221253458593397e-05,
640
+ "loss": 0.211,
641
+ "step": 44500
642
+ },
643
+ {
644
+ "epoch": 1.4477439073433604,
645
+ "grad_norm": 1.345651626586914,
646
+ "learning_rate": 1.3819091435557557e-05,
647
+ "loss": 0.2027,
648
+ "step": 45000
649
+ },
650
+ {
651
+ "epoch": 1.4638301294940883,
652
+ "grad_norm": 1.4520297050476074,
653
+ "learning_rate": 1.3416929412521718e-05,
654
+ "loss": 0.2039,
655
+ "step": 45500
656
+ },
657
+ {
658
+ "epoch": 1.4799163516448162,
659
+ "grad_norm": 1.5913499593734741,
660
+ "learning_rate": 1.3014767389485877e-05,
661
+ "loss": 0.1939,
662
+ "step": 46000
663
+ },
664
+ {
665
+ "epoch": 1.4960025737955442,
666
+ "grad_norm": 1.1803226470947266,
667
+ "learning_rate": 1.2612605366450037e-05,
668
+ "loss": 0.1887,
669
+ "step": 46500
670
+ },
671
+ {
672
+ "epoch": 1.5120887959462719,
673
+ "grad_norm": 1.1462236642837524,
674
+ "learning_rate": 1.2210443343414194e-05,
675
+ "loss": 0.1883,
676
+ "step": 47000
677
+ },
678
+ {
679
+ "epoch": 1.528175018097,
680
+ "grad_norm": 0.8483968377113342,
681
+ "learning_rate": 1.1808281320378355e-05,
682
+ "loss": 0.1809,
683
+ "step": 47500
684
+ },
685
+ {
686
+ "epoch": 1.5442612402477278,
687
+ "grad_norm": 1.1205823421478271,
688
+ "learning_rate": 1.1406119297342515e-05,
689
+ "loss": 0.1813,
690
+ "step": 48000
691
+ },
692
+ {
693
+ "epoch": 1.5603474623984557,
694
+ "grad_norm": 1.417622447013855,
695
+ "learning_rate": 1.1003957274306672e-05,
696
+ "loss": 0.1788,
697
+ "step": 48500
698
+ },
699
+ {
700
+ "epoch": 1.5764336845491838,
701
+ "grad_norm": 1.179103970527649,
702
+ "learning_rate": 1.0602599575316904e-05,
703
+ "loss": 0.1809,
704
+ "step": 49000
705
+ },
706
+ {
707
+ "epoch": 1.5925199066999114,
708
+ "grad_norm": 1.1092889308929443,
709
+ "learning_rate": 1.0200437552281065e-05,
710
+ "loss": 0.1734,
711
+ "step": 49500
712
+ },
713
+ {
714
+ "epoch": 1.6086061288506395,
715
+ "grad_norm": 1.0196574926376343,
716
+ "learning_rate": 9.798275529245222e-06,
717
+ "loss": 0.1688,
718
+ "step": 50000
719
+ },
720
+ {
721
+ "epoch": 1.6246923510013673,
722
+ "grad_norm": 1.1376862525939941,
723
+ "learning_rate": 9.396113506209382e-06,
724
+ "loss": 0.1703,
725
+ "step": 50500
726
+ },
727
+ {
728
+ "epoch": 1.6407785731520952,
729
+ "grad_norm": 0.8885149955749512,
730
+ "learning_rate": 8.995560131265685e-06,
731
+ "loss": 0.1691,
732
+ "step": 51000
733
+ },
734
+ {
735
+ "epoch": 1.6568647953028233,
736
+ "grad_norm": 1.2574944496154785,
737
+ "learning_rate": 8.593398108229844e-06,
738
+ "loss": 0.1615,
739
+ "step": 51500
740
+ },
741
+ {
742
+ "epoch": 1.672951017453551,
743
+ "grad_norm": 1.2620723247528076,
744
+ "learning_rate": 8.191236085194004e-06,
745
+ "loss": 0.1593,
746
+ "step": 52000
747
+ },
748
+ {
749
+ "epoch": 1.689037239604279,
750
+ "grad_norm": 1.551480770111084,
751
+ "learning_rate": 7.789074062158163e-06,
752
+ "loss": 0.1639,
753
+ "step": 52500
754
+ },
755
+ {
756
+ "epoch": 1.7051234617550068,
757
+ "grad_norm": 1.5938962697982788,
758
+ "learning_rate": 7.386912039122322e-06,
759
+ "loss": 0.1587,
760
+ "step": 53000
761
+ },
762
+ {
763
+ "epoch": 1.7212096839057347,
764
+ "grad_norm": 1.0503953695297241,
765
+ "learning_rate": 6.984750016086482e-06,
766
+ "loss": 0.1599,
767
+ "step": 53500
768
+ },
769
+ {
770
+ "epoch": 1.7372959060564628,
771
+ "grad_norm": 1.1205036640167236,
772
+ "learning_rate": 6.583392317096712e-06,
773
+ "loss": 0.1541,
774
+ "step": 54000
775
+ },
776
+ {
777
+ "epoch": 1.7533821282071904,
778
+ "grad_norm": 0.7524433732032776,
779
+ "learning_rate": 6.181230294060872e-06,
780
+ "loss": 0.1521,
781
+ "step": 54500
782
+ },
783
+ {
784
+ "epoch": 1.7694683503579185,
785
+ "grad_norm": 0.9619775414466858,
786
+ "learning_rate": 5.779068271025031e-06,
787
+ "loss": 0.1521,
788
+ "step": 55000
789
+ },
790
+ {
791
+ "epoch": 1.7855545725086464,
792
+ "grad_norm": 0.9406844973564148,
793
+ "learning_rate": 5.37690624798919e-06,
794
+ "loss": 0.1509,
795
+ "step": 55500
796
+ },
797
+ {
798
+ "epoch": 1.8016407946593742,
799
+ "grad_norm": 0.9363726377487183,
800
+ "learning_rate": 4.975548548999421e-06,
801
+ "loss": 0.1513,
802
+ "step": 56000
803
+ },
804
+ {
805
+ "epoch": 1.8177270168101023,
806
+ "grad_norm": 0.9941402673721313,
807
+ "learning_rate": 4.573386525963581e-06,
808
+ "loss": 0.1484,
809
+ "step": 56500
810
+ },
811
+ {
812
+ "epoch": 1.83381323896083,
813
+ "grad_norm": 1.3756345510482788,
814
+ "learning_rate": 4.17122450292774e-06,
815
+ "loss": 0.1509,
816
+ "step": 57000
817
+ },
818
+ {
819
+ "epoch": 1.849899461111558,
820
+ "grad_norm": 1.0644595623016357,
821
+ "learning_rate": 3.7690624798918986e-06,
822
+ "loss": 0.1486,
823
+ "step": 57500
824
+ },
825
+ {
826
+ "epoch": 1.8659856832622859,
827
+ "grad_norm": 1.070890188217163,
828
+ "learning_rate": 3.3669004568560584e-06,
829
+ "loss": 0.1462,
830
+ "step": 58000
831
+ },
832
+ {
833
+ "epoch": 1.8820719054130137,
834
+ "grad_norm": 1.3034768104553223,
835
+ "learning_rate": 2.9647384338202173e-06,
836
+ "loss": 0.1481,
837
+ "step": 58500
838
+ },
839
+ {
840
+ "epoch": 1.8981581275637418,
841
+ "grad_norm": 1.127517580986023,
842
+ "learning_rate": 2.5625764107843767e-06,
843
+ "loss": 0.1451,
844
+ "step": 59000
845
+ },
846
+ {
847
+ "epoch": 1.9142443497144694,
848
+ "grad_norm": 0.9431403279304504,
849
+ "learning_rate": 2.1604143877485364e-06,
850
+ "loss": 0.1458,
851
+ "step": 59500
852
+ },
853
+ {
854
+ "epoch": 1.9303305718651975,
855
+ "grad_norm": 1.271483302116394,
856
+ "learning_rate": 1.7590566887587673e-06,
857
+ "loss": 0.1463,
858
+ "step": 60000
859
+ },
860
+ {
861
+ "epoch": 1.9464167940159254,
862
+ "grad_norm": 0.7327952980995178,
863
+ "learning_rate": 1.3568946657229264e-06,
864
+ "loss": 0.1434,
865
+ "step": 60500
866
+ },
867
+ {
868
+ "epoch": 1.9625030161666532,
869
+ "grad_norm": 1.0670543909072876,
870
+ "learning_rate": 9.547326426870858e-07,
871
+ "loss": 0.1424,
872
+ "step": 61000
873
+ },
874
+ {
875
+ "epoch": 1.9785892383173813,
876
+ "grad_norm": 1.2705425024032593,
877
+ "learning_rate": 5.525706196512451e-07,
878
+ "loss": 0.1431,
879
+ "step": 61500
880
+ },
881
+ {
882
+ "epoch": 1.994675460468109,
883
+ "grad_norm": 0.9267213344573975,
884
+ "learning_rate": 1.5040859661540443e-07,
885
+ "loss": 0.1418,
886
+ "step": 62000
887
+ }
888
+ ],
889
+ "logging_steps": 500,
890
+ "max_steps": 62164,
891
+ "num_input_tokens_seen": 0,
892
+ "num_train_epochs": 2,
893
+ "save_steps": 500,
894
+ "stateful_callbacks": {
895
+ "TrainerControl": {
896
+ "args": {
897
+ "should_epoch_stop": false,
898
+ "should_evaluate": false,
899
+ "should_log": false,
900
+ "should_save": true,
901
+ "should_training_stop": false
902
+ },
903
+ "attributes": {}
904
+ }
905
+ },
906
+ "total_flos": 1.3425693542685082e+17,
907
+ "train_batch_size": 32,
908
+ "trial_name": null,
909
+ "trial_params": null
910
+ }
checkpoints/{checkpoint-57500 → checkpoint-62000}/training_args.bin RENAMED
File without changes
checkpoints/{checkpoint-57726 → checkpoint-62164}/config.json RENAMED
File without changes
checkpoints/{checkpoint-57726 → checkpoint-62164}/generation_config.json RENAMED
File without changes
checkpoints/{checkpoint-57726 → checkpoint-62164}/model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:301a885d1d6deadcc8cd3f69d48c233773d17dfd7a9248c17b8ac07c0d7ad7bc
3
  size 258368552
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e8c8770f6dcbe1b9d9475af0bda9b1c3d8b183e7518931c4ee48457079834ba
3
  size 258368552
checkpoints/{checkpoint-57726 → checkpoint-62164}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3b47db0706750c087efba4d0ceacca7b5fcfbf20d510fef040aa06f8e2013a9
3
  size 516816826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd427ed1567c5a01aba6a5e07e5f19d0d2b1351120d29cd57c3503632243f624
3
  size 516816826
checkpoints/{checkpoint-57000 → checkpoint-62164}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2b2d2320fa9ef0370a3c13fa0f6e2dabe57ba11e8faea9aa4fb04cd7b2dc39a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5400ec42823ce9dd7ec0af9b56911140e87ed03e3a335d283424176da15c490
3
  size 14244
checkpoints/{checkpoint-57000 → checkpoint-62164}/scaler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6806850443ba2262c5ab64f2e14a406d98dcddc0983f05f3e36b9ed41b5a5444
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44e5b40555d97ad78f23a565d71371cb3eae76e6f0c043cf1a9db3960ced1f75
3
  size 988
checkpoints/{checkpoint-57500 → checkpoint-62164}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f05d878c6ce078869c48780d75fe90680db607079d84d01ce28ff2dde7aa6c1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c868981b82a0a6bae2aa0dd7a6e1355ee650c2f6cfd3fe97b38094bc4d5e87e4
3
  size 1064
checkpoints/{checkpoint-57726 → checkpoint-62164}/special_tokens_map.json RENAMED
File without changes
checkpoints/{checkpoint-57726 → checkpoint-62164}/spiece.model RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:deb4456f78071a8ab6ca5a1698b8a196823b52dea960f8bb0cbcbe082828ead4
3
- size 1040899
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:610d79d092a886e9af7d31b94512c46fd20a5f62557bd5b8d8a5b23f1f78650a
3
+ size 1042971
checkpoints/{checkpoint-57726 → checkpoint-62164}/tokenizer.json RENAMED
The diff for this file is too large to render. See raw diff
 
checkpoints/{checkpoint-57726 → checkpoint-62164}/tokenizer_config.json RENAMED
File without changes
checkpoints/checkpoint-62164/trainer_state.json ADDED
@@ -0,0 +1,910 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.9999517413335477,
6
+ "eval_steps": 500,
7
+ "global_step": 62164,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016086222150727903,
14
+ "grad_norm": 0.5078127384185791,
15
+ "learning_rate": 4.959864230101023e-05,
16
+ "loss": 2.1432,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.032172444301455806,
21
+ "grad_norm": 0.4508506655693054,
22
+ "learning_rate": 4.9196480277974395e-05,
23
+ "loss": 1.9093,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.048258666452183706,
28
+ "grad_norm": 0.4430558979511261,
29
+ "learning_rate": 4.879431825493855e-05,
30
+ "loss": 1.8418,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.06434488860291161,
35
+ "grad_norm": 0.4775325059890747,
36
+ "learning_rate": 4.8392156231902713e-05,
37
+ "loss": 1.7771,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.08043111075363951,
42
+ "grad_norm": 0.49685001373291016,
43
+ "learning_rate": 4.7989994208866876e-05,
44
+ "loss": 1.7226,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.09651733290436741,
49
+ "grad_norm": 0.5552434325218201,
50
+ "learning_rate": 4.7587832185831025e-05,
51
+ "loss": 1.6767,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.11260355505509531,
56
+ "grad_norm": 0.6779139637947083,
57
+ "learning_rate": 4.718567016279519e-05,
58
+ "loss": 1.6588,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.12868977720582322,
63
+ "grad_norm": 0.5552022457122803,
64
+ "learning_rate": 4.6783508139759344e-05,
65
+ "loss": 1.603,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.1447759993565511,
70
+ "grad_norm": 0.5302042365074158,
71
+ "learning_rate": 4.638134611672351e-05,
72
+ "loss": 1.5776,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.16086222150727902,
77
+ "grad_norm": 0.5810815691947937,
78
+ "learning_rate": 4.597918409368766e-05,
79
+ "loss": 1.5333,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 0.1769484436580069,
84
+ "grad_norm": 0.5819700956344604,
85
+ "learning_rate": 4.5577022070651826e-05,
86
+ "loss": 1.5168,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 0.19303466580873482,
91
+ "grad_norm": 0.6134072542190552,
92
+ "learning_rate": 4.517486004761599e-05,
93
+ "loss": 1.4748,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 0.2091208879594627,
98
+ "grad_norm": 0.5746152400970459,
99
+ "learning_rate": 4.4772698024580144e-05,
100
+ "loss": 1.4622,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 0.22520711011019062,
105
+ "grad_norm": 0.7663710713386536,
106
+ "learning_rate": 4.437053600154431e-05,
107
+ "loss": 1.4767,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 0.24129333226091854,
112
+ "grad_norm": 0.7993176579475403,
113
+ "learning_rate": 4.396837397850846e-05,
114
+ "loss": 1.4527,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 0.25737955441164645,
119
+ "grad_norm": 0.6892676949501038,
120
+ "learning_rate": 4.3566211955472626e-05,
121
+ "loss": 1.4325,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 0.2734657765623743,
126
+ "grad_norm": 0.6928556561470032,
127
+ "learning_rate": 4.316404993243678e-05,
128
+ "loss": 1.4038,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 0.2895519987131022,
133
+ "grad_norm": 0.7578593492507935,
134
+ "learning_rate": 4.2761887909400944e-05,
135
+ "loss": 1.3945,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 0.30563822086383013,
140
+ "grad_norm": 0.7504703402519226,
141
+ "learning_rate": 4.23597258863651e-05,
142
+ "loss": 1.3644,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 0.32172444301455805,
147
+ "grad_norm": 0.8370710611343384,
148
+ "learning_rate": 4.1957563863329256e-05,
149
+ "loss": 1.3619,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 0.3378106651652859,
154
+ "grad_norm": 0.8501142263412476,
155
+ "learning_rate": 4.155540184029342e-05,
156
+ "loss": 1.3448,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 0.3538968873160138,
161
+ "grad_norm": 0.9001900553703308,
162
+ "learning_rate": 4.1153239817257575e-05,
163
+ "loss": 1.3004,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 0.36998310946674173,
168
+ "grad_norm": 1.0658681392669678,
169
+ "learning_rate": 4.075107779422174e-05,
170
+ "loss": 1.2789,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 0.38606933161746965,
175
+ "grad_norm": 1.1038371324539185,
176
+ "learning_rate": 4.0348915771185894e-05,
177
+ "loss": 1.2651,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 0.40215555376819756,
182
+ "grad_norm": 1.2004213333129883,
183
+ "learning_rate": 3.994755807219613e-05,
184
+ "loss": 1.2216,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 0.4182417759189254,
189
+ "grad_norm": 1.235543966293335,
190
+ "learning_rate": 3.9545396049160286e-05,
191
+ "loss": 1.1955,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 0.43432799806965333,
196
+ "grad_norm": 1.5088828802108765,
197
+ "learning_rate": 3.914323402612445e-05,
198
+ "loss": 1.1836,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 0.45041422022038125,
203
+ "grad_norm": 1.264153242111206,
204
+ "learning_rate": 3.8741072003088605e-05,
205
+ "loss": 1.1658,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 0.46650044237110916,
210
+ "grad_norm": 1.3023343086242676,
211
+ "learning_rate": 3.833971430409884e-05,
212
+ "loss": 1.1481,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 0.48258666452183707,
217
+ "grad_norm": 1.3824670314788818,
218
+ "learning_rate": 3.7938356605109064e-05,
219
+ "loss": 1.1221,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 0.49867288667256493,
224
+ "grad_norm": 1.4364969730377197,
225
+ "learning_rate": 3.75369989061193e-05,
226
+ "loss": 1.1057,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 0.5147591088232929,
231
+ "grad_norm": 2.051701545715332,
232
+ "learning_rate": 3.7134836883083456e-05,
233
+ "loss": 1.0873,
234
+ "step": 16000
235
+ },
236
+ {
237
+ "epoch": 0.5308453309740208,
238
+ "grad_norm": 1.4329720735549927,
239
+ "learning_rate": 3.673267486004762e-05,
240
+ "loss": 1.0607,
241
+ "step": 16500
242
+ },
243
+ {
244
+ "epoch": 0.5469315531247486,
245
+ "grad_norm": 1.4981014728546143,
246
+ "learning_rate": 3.6330512837011775e-05,
247
+ "loss": 1.0516,
248
+ "step": 17000
249
+ },
250
+ {
251
+ "epoch": 0.5630177752754766,
252
+ "grad_norm": 1.3012079000473022,
253
+ "learning_rate": 3.592835081397594e-05,
254
+ "loss": 1.0317,
255
+ "step": 17500
256
+ },
257
+ {
258
+ "epoch": 0.5791039974262044,
259
+ "grad_norm": 1.401825189590454,
260
+ "learning_rate": 3.552699311498617e-05,
261
+ "loss": 1.0183,
262
+ "step": 18000
263
+ },
264
+ {
265
+ "epoch": 0.5951902195769324,
266
+ "grad_norm": 2.0783369541168213,
267
+ "learning_rate": 3.512483109195033e-05,
268
+ "loss": 0.9985,
269
+ "step": 18500
270
+ },
271
+ {
272
+ "epoch": 0.6112764417276603,
273
+ "grad_norm": 2.3940794467926025,
274
+ "learning_rate": 3.4722669068914486e-05,
275
+ "loss": 0.9698,
276
+ "step": 19000
277
+ },
278
+ {
279
+ "epoch": 0.6273626638783881,
280
+ "grad_norm": 1.4747998714447021,
281
+ "learning_rate": 3.432050704587865e-05,
282
+ "loss": 0.9657,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 0.6434488860291161,
287
+ "grad_norm": 3.0782012939453125,
288
+ "learning_rate": 3.391914934688888e-05,
289
+ "loss": 0.9379,
290
+ "step": 20000
291
+ },
292
+ {
293
+ "epoch": 0.659535108179844,
294
+ "grad_norm": 2.4914307594299316,
295
+ "learning_rate": 3.3516987323853034e-05,
296
+ "loss": 0.915,
297
+ "step": 20500
298
+ },
299
+ {
300
+ "epoch": 0.6756213303305718,
301
+ "grad_norm": 2.772120237350464,
302
+ "learning_rate": 3.3115629624863264e-05,
303
+ "loss": 0.9047,
304
+ "step": 21000
305
+ },
306
+ {
307
+ "epoch": 0.6917075524812998,
308
+ "grad_norm": 2.519575595855713,
309
+ "learning_rate": 3.271346760182743e-05,
310
+ "loss": 0.8688,
311
+ "step": 21500
312
+ },
313
+ {
314
+ "epoch": 0.7077937746320276,
315
+ "grad_norm": 4.085098743438721,
316
+ "learning_rate": 3.231130557879158e-05,
317
+ "loss": 0.8581,
318
+ "step": 22000
319
+ },
320
+ {
321
+ "epoch": 0.7238799967827556,
322
+ "grad_norm": 1.4670002460479736,
323
+ "learning_rate": 3.1909143555755745e-05,
324
+ "loss": 0.8354,
325
+ "step": 22500
326
+ },
327
+ {
328
+ "epoch": 0.7399662189334835,
329
+ "grad_norm": 2.4749488830566406,
330
+ "learning_rate": 3.1507785856765975e-05,
331
+ "loss": 0.8108,
332
+ "step": 23000
333
+ },
334
+ {
335
+ "epoch": 0.7560524410842113,
336
+ "grad_norm": 1.8635029792785645,
337
+ "learning_rate": 3.110562383373014e-05,
338
+ "loss": 0.7773,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 0.7721386632349393,
343
+ "grad_norm": 3.5713748931884766,
344
+ "learning_rate": 3.0703461810694294e-05,
345
+ "loss": 0.756,
346
+ "step": 24000
347
+ },
348
+ {
349
+ "epoch": 0.7882248853856672,
350
+ "grad_norm": 1.8903526067733765,
351
+ "learning_rate": 3.0301299787658456e-05,
352
+ "loss": 0.7326,
353
+ "step": 24500
354
+ },
355
+ {
356
+ "epoch": 0.8043111075363951,
357
+ "grad_norm": 8.286703109741211,
358
+ "learning_rate": 2.9899942088668686e-05,
359
+ "loss": 0.6948,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 0.820397329687123,
364
+ "grad_norm": 2.2209272384643555,
365
+ "learning_rate": 2.9497780065632845e-05,
366
+ "loss": 0.6914,
367
+ "step": 25500
368
+ },
369
+ {
370
+ "epoch": 0.8364835518378508,
371
+ "grad_norm": 2.2284536361694336,
372
+ "learning_rate": 2.9095618042597e-05,
373
+ "loss": 0.6585,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 0.8525697739885788,
378
+ "grad_norm": 3.4615938663482666,
379
+ "learning_rate": 2.869345601956116e-05,
380
+ "loss": 0.633,
381
+ "step": 26500
382
+ },
383
+ {
384
+ "epoch": 0.8686559961393067,
385
+ "grad_norm": 3.1158838272094727,
386
+ "learning_rate": 2.829209832057139e-05,
387
+ "loss": 0.6181,
388
+ "step": 27000
389
+ },
390
+ {
391
+ "epoch": 0.8847422182900346,
392
+ "grad_norm": 2.3320417404174805,
393
+ "learning_rate": 2.7889936297535553e-05,
394
+ "loss": 0.5993,
395
+ "step": 27500
396
+ },
397
+ {
398
+ "epoch": 0.9008284404407625,
399
+ "grad_norm": 1.8331427574157715,
400
+ "learning_rate": 2.7487774274499712e-05,
401
+ "loss": 0.5839,
402
+ "step": 28000
403
+ },
404
+ {
405
+ "epoch": 0.9169146625914903,
406
+ "grad_norm": 3.2398369312286377,
407
+ "learning_rate": 2.708561225146387e-05,
408
+ "loss": 0.562,
409
+ "step": 28500
410
+ },
411
+ {
412
+ "epoch": 0.9330008847422183,
413
+ "grad_norm": 1.6575061082839966,
414
+ "learning_rate": 2.66842545524741e-05,
415
+ "loss": 0.5313,
416
+ "step": 29000
417
+ },
418
+ {
419
+ "epoch": 0.9490871068929462,
420
+ "grad_norm": 2.1604230403900146,
421
+ "learning_rate": 2.6282092529438264e-05,
422
+ "loss": 0.5203,
423
+ "step": 29500
424
+ },
425
+ {
426
+ "epoch": 0.9651733290436741,
427
+ "grad_norm": 3.3743808269500732,
428
+ "learning_rate": 2.5879930506402423e-05,
429
+ "loss": 0.4938,
430
+ "step": 30000
431
+ },
432
+ {
433
+ "epoch": 0.981259551194402,
434
+ "grad_norm": 3.766514301300049,
435
+ "learning_rate": 2.5477768483366583e-05,
436
+ "loss": 0.4724,
437
+ "step": 30500
438
+ },
439
+ {
440
+ "epoch": 0.9973457733451299,
441
+ "grad_norm": 2.26712703704834,
442
+ "learning_rate": 2.5075606460330742e-05,
443
+ "loss": 0.4656,
444
+ "step": 31000
445
+ },
446
+ {
447
+ "epoch": 1.0,
448
+ "eval_loss": 0.26554691791534424,
449
+ "eval_runtime": 1917.4803,
450
+ "eval_samples_per_second": 345.81,
451
+ "eval_steps_per_second": 43.227,
452
+ "step": 31083
453
+ },
454
+ {
455
+ "epoch": 1.0134159092737072,
456
+ "grad_norm": 2.1041958332061768,
457
+ "learning_rate": 2.467424876134097e-05,
458
+ "loss": 0.4381,
459
+ "step": 31500
460
+ },
461
+ {
462
+ "epoch": 1.029502131424435,
463
+ "grad_norm": 1.7629106044769287,
464
+ "learning_rate": 2.427208673830513e-05,
465
+ "loss": 0.4298,
466
+ "step": 32000
467
+ },
468
+ {
469
+ "epoch": 1.0455883535751629,
470
+ "grad_norm": 2.5032904148101807,
471
+ "learning_rate": 2.386992471526929e-05,
472
+ "loss": 0.4188,
473
+ "step": 32500
474
+ },
475
+ {
476
+ "epoch": 1.0616745757258907,
477
+ "grad_norm": 1.6467881202697754,
478
+ "learning_rate": 2.3467762692233446e-05,
479
+ "loss": 0.3986,
480
+ "step": 33000
481
+ },
482
+ {
483
+ "epoch": 1.0777607978766186,
484
+ "grad_norm": 1.957220435142517,
485
+ "learning_rate": 2.3065600669197606e-05,
486
+ "loss": 0.382,
487
+ "step": 33500
488
+ },
489
+ {
490
+ "epoch": 1.0938470200273467,
491
+ "grad_norm": 1.6566946506500244,
492
+ "learning_rate": 2.2663438646161765e-05,
493
+ "loss": 0.3689,
494
+ "step": 34000
495
+ },
496
+ {
497
+ "epoch": 1.1099332421780745,
498
+ "grad_norm": 2.081613540649414,
499
+ "learning_rate": 2.2261276623125928e-05,
500
+ "loss": 0.3603,
501
+ "step": 34500
502
+ },
503
+ {
504
+ "epoch": 1.1260194643288024,
505
+ "grad_norm": 2.155226945877075,
506
+ "learning_rate": 2.1859918924136157e-05,
507
+ "loss": 0.3478,
508
+ "step": 35000
509
+ },
510
+ {
511
+ "epoch": 1.1421056864795303,
512
+ "grad_norm": 1.9459590911865234,
513
+ "learning_rate": 2.1457756901100317e-05,
514
+ "loss": 0.3315,
515
+ "step": 35500
516
+ },
517
+ {
518
+ "epoch": 1.1581919086302581,
519
+ "grad_norm": 2.3381567001342773,
520
+ "learning_rate": 2.1055594878064476e-05,
521
+ "loss": 0.3259,
522
+ "step": 36000
523
+ },
524
+ {
525
+ "epoch": 1.1742781307809862,
526
+ "grad_norm": 1.4302254915237427,
527
+ "learning_rate": 2.0653432855028635e-05,
528
+ "loss": 0.3168,
529
+ "step": 36500
530
+ },
531
+ {
532
+ "epoch": 1.190364352931714,
533
+ "grad_norm": 1.1770597696304321,
534
+ "learning_rate": 2.0251270831992795e-05,
535
+ "loss": 0.3082,
536
+ "step": 37000
537
+ },
538
+ {
539
+ "epoch": 1.206450575082442,
540
+ "grad_norm": 1.7475298643112183,
541
+ "learning_rate": 1.9849913133003024e-05,
542
+ "loss": 0.3014,
543
+ "step": 37500
544
+ },
545
+ {
546
+ "epoch": 1.2225367972331698,
547
+ "grad_norm": 1.2397468090057373,
548
+ "learning_rate": 1.9447751109967187e-05,
549
+ "loss": 0.288,
550
+ "step": 38000
551
+ },
552
+ {
553
+ "epoch": 1.2386230193838976,
554
+ "grad_norm": 1.6603740453720093,
555
+ "learning_rate": 1.9045589086931343e-05,
556
+ "loss": 0.2797,
557
+ "step": 38500
558
+ },
559
+ {
560
+ "epoch": 1.2547092415346257,
561
+ "grad_norm": 1.7009538412094116,
562
+ "learning_rate": 1.8643427063895502e-05,
563
+ "loss": 0.275,
564
+ "step": 39000
565
+ },
566
+ {
567
+ "epoch": 1.2707954636853536,
568
+ "grad_norm": 1.4941717386245728,
569
+ "learning_rate": 1.8241265040859662e-05,
570
+ "loss": 0.2623,
571
+ "step": 39500
572
+ },
573
+ {
574
+ "epoch": 1.2868816858360814,
575
+ "grad_norm": 1.941115140914917,
576
+ "learning_rate": 1.7839907341869895e-05,
577
+ "loss": 0.2572,
578
+ "step": 40000
579
+ },
580
+ {
581
+ "epoch": 1.3029679079868093,
582
+ "grad_norm": 1.487726092338562,
583
+ "learning_rate": 1.7437745318834054e-05,
584
+ "loss": 0.2502,
585
+ "step": 40500
586
+ },
587
+ {
588
+ "epoch": 1.3190541301375371,
589
+ "grad_norm": 1.4628674983978271,
590
+ "learning_rate": 1.7035583295798213e-05,
591
+ "loss": 0.2437,
592
+ "step": 41000
593
+ },
594
+ {
595
+ "epoch": 1.3351403522882652,
596
+ "grad_norm": 1.401607632637024,
597
+ "learning_rate": 1.663342127276237e-05,
598
+ "loss": 0.2421,
599
+ "step": 41500
600
+ },
601
+ {
602
+ "epoch": 1.351226574438993,
603
+ "grad_norm": 1.1497563123703003,
604
+ "learning_rate": 1.623125924972653e-05,
605
+ "loss": 0.231,
606
+ "step": 42000
607
+ },
608
+ {
609
+ "epoch": 1.367312796589721,
610
+ "grad_norm": 1.322836995124817,
611
+ "learning_rate": 1.5829097226690688e-05,
612
+ "loss": 0.2261,
613
+ "step": 42500
614
+ },
615
+ {
616
+ "epoch": 1.3833990187404488,
617
+ "grad_norm": 1.5328525304794312,
618
+ "learning_rate": 1.542773952770092e-05,
619
+ "loss": 0.2177,
620
+ "step": 43000
621
+ },
622
+ {
623
+ "epoch": 1.3994852408911767,
624
+ "grad_norm": 1.7748241424560547,
625
+ "learning_rate": 1.502557750466508e-05,
626
+ "loss": 0.2186,
627
+ "step": 43500
628
+ },
629
+ {
630
+ "epoch": 1.4155714630419047,
631
+ "grad_norm": 1.6542141437530518,
632
+ "learning_rate": 1.4623415481629241e-05,
633
+ "loss": 0.2138,
634
+ "step": 44000
635
+ },
636
+ {
637
+ "epoch": 1.4316576851926326,
638
+ "grad_norm": 1.3098843097686768,
639
+ "learning_rate": 1.4221253458593397e-05,
640
+ "loss": 0.211,
641
+ "step": 44500
642
+ },
643
+ {
644
+ "epoch": 1.4477439073433604,
645
+ "grad_norm": 1.345651626586914,
646
+ "learning_rate": 1.3819091435557557e-05,
647
+ "loss": 0.2027,
648
+ "step": 45000
649
+ },
650
+ {
651
+ "epoch": 1.4638301294940883,
652
+ "grad_norm": 1.4520297050476074,
653
+ "learning_rate": 1.3416929412521718e-05,
654
+ "loss": 0.2039,
655
+ "step": 45500
656
+ },
657
+ {
658
+ "epoch": 1.4799163516448162,
659
+ "grad_norm": 1.5913499593734741,
660
+ "learning_rate": 1.3014767389485877e-05,
661
+ "loss": 0.1939,
662
+ "step": 46000
663
+ },
664
+ {
665
+ "epoch": 1.4960025737955442,
666
+ "grad_norm": 1.1803226470947266,
667
+ "learning_rate": 1.2612605366450037e-05,
668
+ "loss": 0.1887,
669
+ "step": 46500
670
+ },
671
+ {
672
+ "epoch": 1.5120887959462719,
673
+ "grad_norm": 1.1462236642837524,
674
+ "learning_rate": 1.2210443343414194e-05,
675
+ "loss": 0.1883,
676
+ "step": 47000
677
+ },
678
+ {
679
+ "epoch": 1.528175018097,
680
+ "grad_norm": 0.8483968377113342,
681
+ "learning_rate": 1.1808281320378355e-05,
682
+ "loss": 0.1809,
683
+ "step": 47500
684
+ },
685
+ {
686
+ "epoch": 1.5442612402477278,
687
+ "grad_norm": 1.1205823421478271,
688
+ "learning_rate": 1.1406119297342515e-05,
689
+ "loss": 0.1813,
690
+ "step": 48000
691
+ },
692
+ {
693
+ "epoch": 1.5603474623984557,
694
+ "grad_norm": 1.417622447013855,
695
+ "learning_rate": 1.1003957274306672e-05,
696
+ "loss": 0.1788,
697
+ "step": 48500
698
+ },
699
+ {
700
+ "epoch": 1.5764336845491838,
701
+ "grad_norm": 1.179103970527649,
702
+ "learning_rate": 1.0602599575316904e-05,
703
+ "loss": 0.1809,
704
+ "step": 49000
705
+ },
706
+ {
707
+ "epoch": 1.5925199066999114,
708
+ "grad_norm": 1.1092889308929443,
709
+ "learning_rate": 1.0200437552281065e-05,
710
+ "loss": 0.1734,
711
+ "step": 49500
712
+ },
713
+ {
714
+ "epoch": 1.6086061288506395,
715
+ "grad_norm": 1.0196574926376343,
716
+ "learning_rate": 9.798275529245222e-06,
717
+ "loss": 0.1688,
718
+ "step": 50000
719
+ },
720
+ {
721
+ "epoch": 1.6246923510013673,
722
+ "grad_norm": 1.1376862525939941,
723
+ "learning_rate": 9.396113506209382e-06,
724
+ "loss": 0.1703,
725
+ "step": 50500
726
+ },
727
+ {
728
+ "epoch": 1.6407785731520952,
729
+ "grad_norm": 0.8885149955749512,
730
+ "learning_rate": 8.995560131265685e-06,
731
+ "loss": 0.1691,
732
+ "step": 51000
733
+ },
734
+ {
735
+ "epoch": 1.6568647953028233,
736
+ "grad_norm": 1.2574944496154785,
737
+ "learning_rate": 8.593398108229844e-06,
738
+ "loss": 0.1615,
739
+ "step": 51500
740
+ },
741
+ {
742
+ "epoch": 1.672951017453551,
743
+ "grad_norm": 1.2620723247528076,
744
+ "learning_rate": 8.191236085194004e-06,
745
+ "loss": 0.1593,
746
+ "step": 52000
747
+ },
748
+ {
749
+ "epoch": 1.689037239604279,
750
+ "grad_norm": 1.551480770111084,
751
+ "learning_rate": 7.789074062158163e-06,
752
+ "loss": 0.1639,
753
+ "step": 52500
754
+ },
755
+ {
756
+ "epoch": 1.7051234617550068,
757
+ "grad_norm": 1.5938962697982788,
758
+ "learning_rate": 7.386912039122322e-06,
759
+ "loss": 0.1587,
760
+ "step": 53000
761
+ },
762
+ {
763
+ "epoch": 1.7212096839057347,
764
+ "grad_norm": 1.0503953695297241,
765
+ "learning_rate": 6.984750016086482e-06,
766
+ "loss": 0.1599,
767
+ "step": 53500
768
+ },
769
+ {
770
+ "epoch": 1.7372959060564628,
771
+ "grad_norm": 1.1205036640167236,
772
+ "learning_rate": 6.583392317096712e-06,
773
+ "loss": 0.1541,
774
+ "step": 54000
775
+ },
776
+ {
777
+ "epoch": 1.7533821282071904,
778
+ "grad_norm": 0.7524433732032776,
779
+ "learning_rate": 6.181230294060872e-06,
780
+ "loss": 0.1521,
781
+ "step": 54500
782
+ },
783
+ {
784
+ "epoch": 1.7694683503579185,
785
+ "grad_norm": 0.9619775414466858,
786
+ "learning_rate": 5.779068271025031e-06,
787
+ "loss": 0.1521,
788
+ "step": 55000
789
+ },
790
+ {
791
+ "epoch": 1.7855545725086464,
792
+ "grad_norm": 0.9406844973564148,
793
+ "learning_rate": 5.37690624798919e-06,
794
+ "loss": 0.1509,
795
+ "step": 55500
796
+ },
797
+ {
798
+ "epoch": 1.8016407946593742,
799
+ "grad_norm": 0.9363726377487183,
800
+ "learning_rate": 4.975548548999421e-06,
801
+ "loss": 0.1513,
802
+ "step": 56000
803
+ },
804
+ {
805
+ "epoch": 1.8177270168101023,
806
+ "grad_norm": 0.9941402673721313,
807
+ "learning_rate": 4.573386525963581e-06,
808
+ "loss": 0.1484,
809
+ "step": 56500
810
+ },
811
+ {
812
+ "epoch": 1.83381323896083,
813
+ "grad_norm": 1.3756345510482788,
814
+ "learning_rate": 4.17122450292774e-06,
815
+ "loss": 0.1509,
816
+ "step": 57000
817
+ },
818
+ {
819
+ "epoch": 1.849899461111558,
820
+ "grad_norm": 1.0644595623016357,
821
+ "learning_rate": 3.7690624798918986e-06,
822
+ "loss": 0.1486,
823
+ "step": 57500
824
+ },
825
+ {
826
+ "epoch": 1.8659856832622859,
827
+ "grad_norm": 1.070890188217163,
828
+ "learning_rate": 3.3669004568560584e-06,
829
+ "loss": 0.1462,
830
+ "step": 58000
831
+ },
832
+ {
833
+ "epoch": 1.8820719054130137,
834
+ "grad_norm": 1.3034768104553223,
835
+ "learning_rate": 2.9647384338202173e-06,
836
+ "loss": 0.1481,
837
+ "step": 58500
838
+ },
839
+ {
840
+ "epoch": 1.8981581275637418,
841
+ "grad_norm": 1.127517580986023,
842
+ "learning_rate": 2.5625764107843767e-06,
843
+ "loss": 0.1451,
844
+ "step": 59000
845
+ },
846
+ {
847
+ "epoch": 1.9142443497144694,
848
+ "grad_norm": 0.9431403279304504,
849
+ "learning_rate": 2.1604143877485364e-06,
850
+ "loss": 0.1458,
851
+ "step": 59500
852
+ },
853
+ {
854
+ "epoch": 1.9303305718651975,
855
+ "grad_norm": 1.271483302116394,
856
+ "learning_rate": 1.7590566887587673e-06,
857
+ "loss": 0.1463,
858
+ "step": 60000
859
+ },
860
+ {
861
+ "epoch": 1.9464167940159254,
862
+ "grad_norm": 0.7327952980995178,
863
+ "learning_rate": 1.3568946657229264e-06,
864
+ "loss": 0.1434,
865
+ "step": 60500
866
+ },
867
+ {
868
+ "epoch": 1.9625030161666532,
869
+ "grad_norm": 1.0670543909072876,
870
+ "learning_rate": 9.547326426870858e-07,
871
+ "loss": 0.1424,
872
+ "step": 61000
873
+ },
874
+ {
875
+ "epoch": 1.9785892383173813,
876
+ "grad_norm": 1.2705425024032593,
877
+ "learning_rate": 5.525706196512451e-07,
878
+ "loss": 0.1431,
879
+ "step": 61500
880
+ },
881
+ {
882
+ "epoch": 1.994675460468109,
883
+ "grad_norm": 0.9267213344573975,
884
+ "learning_rate": 1.5040859661540443e-07,
885
+ "loss": 0.1418,
886
+ "step": 62000
887
+ }
888
+ ],
889
+ "logging_steps": 500,
890
+ "max_steps": 62164,
891
+ "num_input_tokens_seen": 0,
892
+ "num_train_epochs": 2,
893
+ "save_steps": 500,
894
+ "stateful_callbacks": {
895
+ "TrainerControl": {
896
+ "args": {
897
+ "should_epoch_stop": false,
898
+ "should_evaluate": false,
899
+ "should_log": false,
900
+ "should_save": true,
901
+ "should_training_stop": true
902
+ },
903
+ "attributes": {}
904
+ }
905
+ },
906
+ "total_flos": 1.3461207231391334e+17,
907
+ "train_batch_size": 32,
908
+ "trial_name": null,
909
+ "trial_params": null
910
+ }
checkpoints/{checkpoint-57726 → checkpoint-62164}/training_args.bin RENAMED
File without changes
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:301a885d1d6deadcc8cd3f69d48c233773d17dfd7a9248c17b8ac07c0d7ad7bc
3
  size 258368552
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e8c8770f6dcbe1b9d9475af0bda9b1c3d8b183e7518931c4ee48457079834ba
3
  size 258368552
spiece.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:deb4456f78071a8ab6ca5a1698b8a196823b52dea960f8bb0cbcbe082828ead4
3
- size 1040899
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:610d79d092a886e9af7d31b94512c46fd20a5f62557bd5b8d8a5b23f1f78650a
3
+ size 1042971
src/data/generate_cyr_lat_pairs.py CHANGED
@@ -110,7 +110,7 @@ with open(output_path, 'w', encoding = "utf-8") as out_file:
110
 
111
  with open(output_path, 'a', encoding = "utf-8") as out_file:
112
  with open("src/data/kk.txt", 'r', encoding = "utf-8") as f:
113
- for line in tqdm(islice(f, 2_000_000), total = 2_000_000, desc = "Lines in CC100-Kazakh"):
114
  try:
115
  cyr_text = line.strip()
116
  lat_text = convert_to_latin(cyr_text).strip()
 
110
 
111
  with open(output_path, 'a', encoding = "utf-8") as out_file:
112
  with open("src/data/kk.txt", 'r', encoding = "utf-8") as f:
113
+ for line in tqdm(islice(f, 2_200_000), total = 2_200_000, desc = "Lines in CC100-Kazakh"):
114
  try:
115
  cyr_text = line.strip()
116
  lat_text = convert_to_latin(cyr_text).strip()
src/tokeniser/dalat5_sp.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:deb4456f78071a8ab6ca5a1698b8a196823b52dea960f8bb0cbcbe082828ead4
3
- size 1040899
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:610d79d092a886e9af7d31b94512c46fd20a5f62557bd5b8d8a5b23f1f78650a
3
+ size 1042971
src/tokeniser/dalat5_sp.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
src/tokeniser/spiece.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:deb4456f78071a8ab6ca5a1698b8a196823b52dea960f8bb0cbcbe082828ead4
3
- size 1040899
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:610d79d092a886e9af7d31b94512c46fd20a5f62557bd5b8d8a5b23f1f78650a
3
+ size 1042971
src/train_tokeniser.py CHANGED
@@ -26,6 +26,7 @@ spm.SentencePieceTrainer.Train(
26
  vocab_size = 40000,
27
  model_type = "unigram", # worth testing with "bpe"
28
  character_coverage = 1.0, # to preserve rare characters like ä, ñ, etc.
 
29
  pad_id = 0,
30
  unk_id = 1,
31
  bos_id = 2,
 
26
  vocab_size = 40000,
27
  model_type = "unigram", # worth testing with "bpe"
28
  character_coverage = 1.0, # to preserve rare characters like ä, ñ, etc.
29
+ max_sentence_length = 8384,
30
  pad_id = 0,
31
  unk_id = 1,
32
  bos_id = 2,