apriasmoro committed
Commit 099a7b4 · verified · 1 Parent(s): 38889e6

Training in progress, step 100, checkpoint
last-checkpoint/adapter_config.json CHANGED
@@ -25,13 +25,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "up_proj",
     "o_proj",
     "k_proj",
+    "v_proj",
+    "down_proj",
     "gate_proj",
     "q_proj",
-    "v_proj",
-    "down_proj"
+    "up_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1ba60744f926268ff7fdba5a7cfac8f3f850cb521c5738e6eb47d7d4e2da6d8f
+oid sha256:2b9fbf5d11589875c8f0f18e5f8873568a3bc8596407bb07ad845db4518cf642
 size 349243752
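
This file (like optimizer.pt, rng_state.pth, scheduler.pt, and training_args.bin below) is stored through Git LFS, so the diff only shows the pointer: the sha256 oid changes while the byte size stays the same. A small, generic sketch for checking that a downloaded file matches its pointer (the path is illustrative):

# Hypothetical check: compare a local file against the oid recorded in its Git LFS pointer.
import hashlib

def sha256_of(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# For this commit, the adapter weights should hash to
# 2b9fbf5d11589875c8f0f18e5f8873568a3bc8596407bb07ad845db4518cf642.
print(sha256_of("last-checkpoint/adapter_model.safetensors"))
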
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1366b8acb325858c396a2b90bc1e7d009944272ff6b31772e996c3a077d88767
+oid sha256:0bc343d82a0e3f72cfba0430c8d86e3dcbe33feff1416efe2cf854f3b74275f8
 size 177908741
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:15245f502f2b0897efe7ae5bd9aea29ced0716f57f68c802c749e42ddc72ee1c
+oid sha256:5dd9c2e9d70880f16863f7e49763010a424edf0e7cc0c70adea8dfa600069583
 size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:da0b998422a3dc253ae0972fd9207eebf2190589880dd54501b58c1760fdda21
+oid sha256:3f7440e8d8eb662ab89372782909ec9a7c72ecb22b3a157dc2f42bb2972c021b
 size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,156 +2,191 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.01990049751243781,
+  "epoch": 0.09950248756218906,
   "eval_steps": 500,
-  "global_step": 20,
+  "global_step": 100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0009950248756218905,
-      "grad_norm": 0.8776684999465942,
-      "learning_rate": 0.0,
-      "loss": 1.6946,
-      "step": 1
+      "epoch": 0.003980099502487562,
+      "grad_norm": 1.0094934701919556,
+      "learning_rate": 1.3649999999999998e-07,
+      "loss": 1.8949,
+      "step": 4
     },
     {
-      "epoch": 0.001990049751243781,
-      "grad_norm": 0.9731295704841614,
-      "learning_rate": 7e-07,
-      "loss": 1.852,
-      "step": 2
+      "epoch": 0.007960199004975124,
+      "grad_norm": 0.9780156016349792,
+      "learning_rate": 3.185e-07,
+      "loss": 1.966,
+      "step": 8
     },
     {
-      "epoch": 0.0029850746268656717,
-      "grad_norm": 0.9005614519119263,
-      "learning_rate": 1.4e-06,
-      "loss": 1.7867,
-      "step": 3
+      "epoch": 0.011940298507462687,
+      "grad_norm": 0.9392228722572327,
+      "learning_rate": 5.005e-07,
+      "loss": 1.9327,
+      "step": 12
     },
     {
-      "epoch": 0.003980099502487562,
-      "grad_norm": 0.974780261516571,
-      "learning_rate": 2.1e-06,
-      "loss": 1.7229,
-      "step": 4
+      "epoch": 0.015920398009950248,
+      "grad_norm": 0.7723965644836426,
+      "learning_rate": 6.824999999999999e-07,
+      "loss": 1.6594,
+      "step": 16
     },
     {
-      "epoch": 0.004975124378109453,
-      "grad_norm": 0.711536169052124,
-      "learning_rate": 2.8e-06,
-      "loss": 1.6955,
-      "step": 5
+      "epoch": 0.01990049751243781,
+      "grad_norm": 0.7648000121116638,
+      "learning_rate": 8.644999999999999e-07,
+      "loss": 1.8772,
+      "step": 20
     },
     {
-      "epoch": 0.005970149253731343,
-      "grad_norm": 0.7311899065971375,
-      "learning_rate": 3.5e-06,
-      "loss": 1.8664,
-      "step": 6
+      "epoch": 0.023880597014925373,
+      "grad_norm": 0.7800928950309753,
+      "learning_rate": 1.0465e-06,
+      "loss": 1.7927,
+      "step": 24
     },
     {
-      "epoch": 0.006965174129353234,
-      "grad_norm": 0.7126452326774597,
-      "learning_rate": 4.2e-06,
-      "loss": 1.8133,
-      "step": 7
+      "epoch": 0.027860696517412936,
+      "grad_norm": 0.6133188605308533,
+      "learning_rate": 1.2285e-06,
+      "loss": 1.8074,
+      "step": 28
     },
     {
-      "epoch": 0.007960199004975124,
-      "grad_norm": 0.7019472122192383,
-      "learning_rate": 4.9e-06,
-      "loss": 1.6724,
-      "step": 8
+      "epoch": 0.031840796019900496,
+      "grad_norm": 0.6503555774688721,
+      "learning_rate": 1.4104999999999999e-06,
+      "loss": 1.8108,
+      "step": 32
     },
     {
-      "epoch": 0.008955223880597015,
-      "grad_norm": 0.7028383016586304,
-      "learning_rate": 5.6e-06,
-      "loss": 1.7337,
-      "step": 9
+      "epoch": 0.03582089552238806,
+      "grad_norm": 0.6871572136878967,
+      "learning_rate": 1.5924999999999998e-06,
+      "loss": 1.6947,
+      "step": 36
     },
     {
-      "epoch": 0.009950248756218905,
-      "grad_norm": 0.6948546767234802,
-      "learning_rate": 6.299999999999999e-06,
-      "loss": 1.6073,
-      "step": 10
+      "epoch": 0.03980099502487562,
+      "grad_norm": 0.5543396472930908,
+      "learning_rate": 1.7745e-06,
+      "loss": 1.6361,
+      "step": 40
     },
     {
-      "epoch": 0.010945273631840797,
-      "grad_norm": 0.6322774291038513,
-      "learning_rate": 7e-06,
-      "loss": 1.6254,
-      "step": 11
+      "epoch": 0.04378109452736319,
+      "grad_norm": 0.502059280872345,
+      "learning_rate": 1.9565e-06,
+      "loss": 1.6627,
+      "step": 44
     },
     {
-      "epoch": 0.011940298507462687,
-      "grad_norm": 0.5230722427368164,
-      "learning_rate": 7.699999999999999e-06,
-      "loss": 1.704,
-      "step": 12
+      "epoch": 0.04776119402985075,
+      "grad_norm": 0.4744930565357208,
+      "learning_rate": 2.1384999999999995e-06,
+      "loss": 1.6207,
+      "step": 48
     },
     {
-      "epoch": 0.012935323383084577,
-      "grad_norm": 0.38045769929885864,
-      "learning_rate": 8.4e-06,
-      "loss": 1.5992,
-      "step": 13
+      "epoch": 0.051741293532338306,
+      "grad_norm": 0.4121508300304413,
+      "learning_rate": 2.3205e-06,
+      "loss": 1.5601,
+      "step": 52
     },
     {
-      "epoch": 0.013930348258706468,
-      "grad_norm": 0.43926432728767395,
-      "learning_rate": 9.1e-06,
-      "loss": 1.3649,
-      "step": 14
+      "epoch": 0.05572139303482587,
+      "grad_norm": 0.364520788192749,
+      "learning_rate": 2.5025e-06,
+      "loss": 1.605,
+      "step": 56
     },
     {
-      "epoch": 0.014925373134328358,
-      "grad_norm": 0.6113471388816833,
-      "learning_rate": 9.8e-06,
-      "loss": 1.7055,
-      "step": 15
+      "epoch": 0.05970149253731343,
+      "grad_norm": 0.34304410219192505,
+      "learning_rate": 2.6844999999999995e-06,
+      "loss": 1.556,
+      "step": 60
     },
     {
-      "epoch": 0.015920398009950248,
-      "grad_norm": 0.549103856086731,
-      "learning_rate": 1.05e-05,
-      "loss": 1.6179,
-      "step": 16
+      "epoch": 0.06368159203980099,
+      "grad_norm": 0.28909891843795776,
+      "learning_rate": 2.8665e-06,
+      "loss": 1.6229,
+      "step": 64
     },
     {
-      "epoch": 0.01691542288557214,
-      "grad_norm": 0.39344537258148193,
-      "learning_rate": 1.12e-05,
-      "loss": 1.517,
-      "step": 17
+      "epoch": 0.06766169154228856,
+      "grad_norm": 0.2927353084087372,
+      "learning_rate": 3.0485e-06,
+      "loss": 1.6583,
+      "step": 68
     },
     {
-      "epoch": 0.01791044776119403,
-      "grad_norm": 0.3620043098926544,
-      "learning_rate": 1.19e-05,
-      "loss": 1.4598,
-      "step": 18
+      "epoch": 0.07164179104477612,
+      "grad_norm": 0.2617412805557251,
+      "learning_rate": 3.2304999999999994e-06,
+      "loss": 1.3631,
+      "step": 72
     },
     {
-      "epoch": 0.01890547263681592,
-      "grad_norm": 0.271251380443573,
-      "learning_rate": 1.2599999999999998e-05,
-      "loss": 1.441,
-      "step": 19
+      "epoch": 0.07562189054726368,
+      "grad_norm": 0.20857785642147064,
+      "learning_rate": 3.4125e-06,
+      "loss": 1.4188,
+      "step": 76
     },
     {
-      "epoch": 0.01990049751243781,
-      "grad_norm": 0.2406337559223175,
-      "learning_rate": 1.33e-05,
-      "loss": 1.5409,
-      "step": 20
+      "epoch": 0.07960199004975124,
+      "grad_norm": 0.21844792366027832,
+      "learning_rate": 3.5945e-06,
+      "loss": 1.4926,
+      "step": 80
+    },
+    {
+      "epoch": 0.08358208955223881,
+      "grad_norm": 0.21169371902942657,
+      "learning_rate": 3.7764999999999993e-06,
+      "loss": 1.4431,
+      "step": 84
+    },
+    {
+      "epoch": 0.08756218905472637,
+      "grad_norm": 0.2326797991991043,
+      "learning_rate": 3.9584999999999995e-06,
+      "loss": 1.453,
+      "step": 88
+    },
+    {
+      "epoch": 0.09154228855721393,
+      "grad_norm": 0.184279665350914,
+      "learning_rate": 4.1404999999999996e-06,
+      "loss": 1.4547,
+      "step": 92
+    },
+    {
+      "epoch": 0.0955223880597015,
+      "grad_norm": 0.21595323085784912,
+      "learning_rate": 4.3225e-06,
+      "loss": 1.5044,
+      "step": 96
+    },
+    {
+      "epoch": 0.09950248756218906,
+      "grad_norm": 0.22414354979991913,
+      "learning_rate": 4.5045e-06,
+      "loss": 1.5599,
+      "step": 100
     }
   ],
-  "logging_steps": 1,
-  "max_steps": 20,
+  "logging_steps": 4,
+  "max_steps": 972,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
@@ -162,12 +197,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": true
+        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 1.5380973908656128e+16,
+  "total_flos": 7.46000752878551e+16,
   "train_batch_size": 24,
   "trial_name": null,
   "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:15283826ff4848bc5c704b3622e4600c644b4dbefeeb2c0a0476dd07ee352d21
+oid sha256:8aa8c29fb1ed8d66b99ca0913e01151bf5c6d50026712b90dfade04ea18efec8
 size 7697