scottsuk0306 committed on
Commit a8c0130
1 Parent(s): 67a44e1

Model save

README.md CHANGED
@@ -2,10 +2,6 @@
  license: gemma
  base_model: google/gemma-2-9b
  tags:
- - easylm
- - trl
- - sft
- - generated_from_trainer
  - trl
  - sft
  - generated_from_trainer
@@ -23,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->

  This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on the alpaca_farm dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.9946
+ - Loss: 0.6364

  ## Model description

@@ -43,24 +39,33 @@ More information needed

  The following hyperparameters were used during training:
  - learning_rate: 3e-06
- - train_batch_size: 1
- - eval_batch_size: 1
+ - train_batch_size: 2
+ - eval_batch_size: 2
  - seed: 42
  - distributed_type: multi-GPU
  - num_devices: 8
- - total_train_batch_size: 8
- - total_eval_batch_size: 8
+ - total_train_batch_size: 16
+ - total_eval_batch_size: 16
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: cosine
- - num_epochs: 3
+ - num_epochs: 1

  ### Training results

  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:-----:|:----:|:---------------:|
- | 0.6413 | 1.0 | 1250 | 0.6420 |
- | 0.3318 | 2.0 | 2500 | 0.7324 |
- | 0.1518 | 3.0 | 3750 | 0.9946 |
+ | 0.7159 | 0.08 | 50 | 0.6576 |
+ | 0.6551 | 0.16 | 100 | 0.6588 |
+ | 0.6519 | 0.24 | 150 | 0.6581 |
+ | 0.6278 | 0.32 | 200 | 0.6568 |
+ | 0.6394 | 0.4 | 250 | 0.6533 |
+ | 0.6528 | 0.48 | 300 | 0.6503 |
+ | 0.6382 | 0.56 | 350 | 0.6454 |
+ | 0.638 | 0.64 | 400 | 0.6426 |
+ | 0.618 | 0.72 | 450 | 0.6400 |
+ | 0.6378 | 0.8 | 500 | 0.6379 |
+ | 0.6338 | 0.88 | 550 | 0.6368 |
+ | 0.6284 | 0.96 | 600 | 0.6364 |


  ### Framework versions
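
The hyperparameters in the updated card map directly onto Hugging Face `TrainingArguments`. As an illustration only (the actual training script, dataset loading, and any TRL `SFTTrainer` wiring are not part of this commit), a configuration consistent with the card and with `trainer_state.json` might look like the sketch below; `output_dir` is a placeholder and `bf16` is an assumption, since precision is not stated anywhere in the diff.

```python
# Hedged sketch: TrainingArguments consistent with the values shown in this commit.
# Not the author's script; only the numeric values are taken from the README / trainer_state.json.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="gemma-2-9b-alpaca-farm-sft",  # placeholder name, not from the commit
    learning_rate=3e-06,
    per_device_train_batch_size=2,   # "train_batch_size: 2" in the card
    per_device_eval_batch_size=2,    # "eval_batch_size: 2"
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    seed=42,
    logging_steps=50,                # matches "logging_steps": 50 in trainer_state.json
    eval_strategy="steps",
    eval_steps=50,                   # matches "eval_steps": 50
    save_steps=500,                  # matches "save_steps": 500
    bf16=True,                       # assumption; precision is not recorded in the card
)

# With 8 processes (one per GPU) and no gradient accumulation listed,
# the effective batch size is 2 * 8 = 16, so 10,000 samples give 625 steps per epoch.
```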
all_results.json CHANGED
@@ -1,14 +1,9 @@
  {
-     "epoch": 3.0,
-     "eval_loss": 0.994577169418335,
-     "eval_runtime": 20.7473,
-     "eval_samples": 2000,
-     "eval_samples_per_second": 96.398,
-     "eval_steps_per_second": 12.05,
-     "total_flos": 2.3924465493816115e+17,
-     "train_loss": 0.37510518595377607,
-     "train_runtime": 2372.5928,
+     "epoch": 1.0,
+     "total_flos": 9.391098276138189e+16,
+     "train_loss": 0.6481949188232422,
+     "train_runtime": 2478.7348,
      "train_samples": 10000,
-     "train_samples_per_second": 12.644,
-     "train_steps_per_second": 1.581
+     "train_samples_per_second": 4.034,
+     "train_steps_per_second": 0.252
  }
model-00001-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:730503b5f82e8151507cd063b17d4a5da741fbf0d91f70f7056373da25886c21
+ oid sha256:9528047cd1ea28eca3499b2b8a7b3d261c4d42dcd8b1e7b036a8ea967c937c23
  size 4903351912
model-00002-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:36c4b5075fef66eeffcce36a789b1e3dcdf2bf17e286565ef22c960fd276bc02
+ oid sha256:d6fa1e3201d30cf2996f1ad9490be4dda21cdece8f024eb2959c26ac19260d16
  size 4947570872
model-00003-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b993b8ecc8b2ba33cd90ed3d128f1545205f0deac2f2979a72999f00830fe263
+ oid sha256:19006e8c3d14b2f67c927289f4e75f6cfd842b28d6191dcebd75e6d2e49678f0
  size 4962221464
model-00004-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:460270e8f8514cd94701a49a0d305d9438dbe482f265593fbf86d875001d607a
+ oid sha256:976c695d8c47863f06f254d22b97010871fb4b245c647731fc79f81a7f937ced
  size 3670322200
model-00005-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d99a8866f4395ac169b4742c110000d655fc6b67d57ece2889124efb79be7d0d
+ oid sha256:fbbd5e5b60ca1338e216c06b6553729cf4a8960abbe6d585a0841e73da224978
  size 1835008128
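
Each `.safetensors` entry above is a Git LFS pointer, so only its `oid sha256:` and `size` fields change in the diff while the shard itself lives in LFS storage. A minimal sketch of checking a downloaded shard against the new pointer value, assuming the file has already been fetched to the current directory:

```python
# Hedged sketch: verify a downloaded shard against the sha256 recorded in its LFS pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file so large shards do not have to fit in memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# New pointer value for model-00001-of-00005.safetensors in this commit.
expected = "9528047cd1ea28eca3499b2b8a7b3d261c4d42dcd8b1e7b036a8ea967c937c23"
actual = sha256_of("model-00001-of-00005.safetensors")
print("OK" if actual == expected else f"mismatch: {actual}")
```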
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
-     "epoch": 3.0,
-     "total_flos": 2.3924465493816115e+17,
-     "train_loss": 0.37510518595377607,
-     "train_runtime": 2372.5928,
+     "epoch": 1.0,
+     "total_flos": 9.391098276138189e+16,
+     "train_loss": 0.6481949188232422,
+     "train_runtime": 2478.7348,
      "train_samples": 10000,
-     "train_samples_per_second": 12.644,
-     "train_steps_per_second": 1.581
+     "train_samples_per_second": 4.034,
+     "train_steps_per_second": 0.252
  }
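
The throughput numbers in the new run are internally consistent: 10,000 samples, one epoch, and a total batch size of 16 give 625 optimizer steps, and a runtime of 2478.7 s implies the reported rates. A quick check using only values that appear in this commit:

```python
# Sanity check of train_results.json using only numbers present in this diff.
train_samples = 10_000
num_epochs = 1
total_batch_size = 16          # 2 per device * 8 devices, from the README
train_runtime = 2478.7348      # seconds, from train_results.json

steps = train_samples * num_epochs // total_batch_size
print(steps)                                                  # 625, matching global_step / max_steps
print(round(train_samples * num_epochs / train_runtime, 3))   # 4.034 samples/s, as reported
print(round(steps / train_runtime, 3))                        # 0.252 steps/s, as reported
```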
trainer_state.json CHANGED
@@ -1,100 +1,207 @@
  {
    "best_metric": null,
    "best_model_checkpoint": null,
-   "epoch": 3.0,
-   "eval_steps": 500,
-   "global_step": 3750,
+   "epoch": 1.0,
+   "eval_steps": 50,
+   "global_step": 625,
    "is_hyper_param_search": false,
    "is_local_process_zero": true,
    "is_world_process_zero": true,
    "log_history": [
      {
-       "epoch": 0.4,
-       "grad_norm": 4.658291339874268,
-       "learning_rate": 2.8703181864639013e-06,
-       "loss": 0.64,
-       "step": 500
+       "epoch": 0.08,
+       "grad_norm": 4.338859558105469,
+       "learning_rate": 2.9528747416929465e-06,
+       "loss": 0.7159,
+       "step": 50
      },
      {
-       "epoch": 0.8,
-       "grad_norm": 4.34930944442749,
-       "learning_rate": 2.5036959095382875e-06,
-       "loss": 0.6413,
-       "step": 1000
+       "epoch": 0.08,
+       "eval_loss": 0.6576318144798279,
+       "eval_runtime": 12.5345,
+       "eval_samples_per_second": 159.56,
+       "eval_steps_per_second": 9.972,
+       "step": 50
      },
      {
-       "epoch": 1.0,
-       "eval_loss": 0.6420477032661438,
-       "eval_runtime": 20.5136,
-       "eval_samples_per_second": 97.497,
-       "eval_steps_per_second": 12.187,
-       "step": 1250
+       "epoch": 0.16,
+       "grad_norm": 4.263615131378174,
+       "learning_rate": 2.814460020065795e-06,
+       "loss": 0.6551,
+       "step": 100
+     },
+     {
+       "epoch": 0.16,
+       "eval_loss": 0.6588318943977356,
+       "eval_runtime": 12.1193,
+       "eval_samples_per_second": 165.026,
+       "eval_steps_per_second": 10.314,
+       "step": 100
+     },
+     {
+       "epoch": 0.24,
+       "grad_norm": 4.684665679931641,
+       "learning_rate": 2.5934529411321173e-06,
+       "loss": 0.6519,
+       "step": 150
+     },
+     {
+       "epoch": 0.24,
+       "eval_loss": 0.6581148505210876,
+       "eval_runtime": 12.4709,
+       "eval_samples_per_second": 160.373,
+       "eval_steps_per_second": 10.023,
+       "step": 150
      },
      {
-       "epoch": 1.2,
-       "grad_norm": 5.983999252319336,
+       "epoch": 0.32,
+       "grad_norm": 4.372674942016602,
+       "learning_rate": 2.303740192468495e-06,
+       "loss": 0.6278,
+       "step": 200
+     },
+     {
+       "epoch": 0.32,
+       "eval_loss": 0.656774640083313,
+       "eval_runtime": 12.0295,
+       "eval_samples_per_second": 166.258,
+       "eval_steps_per_second": 10.391,
+       "step": 200
+     },
+     {
+       "epoch": 0.4,
+       "grad_norm": 3.940370798110962,
        "learning_rate": 1.963525491562421e-06,
-       "loss": 0.4847,
-       "step": 1500
+       "loss": 0.6394,
+       "step": 250
+     },
+     {
+       "epoch": 0.4,
+       "eval_loss": 0.6532722115516663,
+       "eval_runtime": 12.0522,
+       "eval_samples_per_second": 165.945,
+       "eval_steps_per_second": 10.372,
+       "step": 250
+     },
+     {
+       "epoch": 0.48,
+       "grad_norm": 4.178117752075195,
+       "learning_rate": 1.5941857792939703e-06,
+       "loss": 0.6528,
+       "step": 300
      },
      {
-       "epoch": 1.6,
-       "grad_norm": 5.177482604980469,
-       "learning_rate": 1.3432073050985201e-06,
-       "loss": 0.3344,
-       "step": 2000
+       "epoch": 0.48,
+       "eval_loss": 0.6502550840377808,
+       "eval_runtime": 12.0395,
+       "eval_samples_per_second": 166.12,
+       "eval_steps_per_second": 10.383,
+       "step": 300
      },
      {
-       "epoch": 2.0,
-       "grad_norm": 4.585903167724609,
-       "learning_rate": 7.500000000000003e-07,
-       "loss": 0.3318,
-       "step": 2500
+       "epoch": 0.56,
+       "grad_norm": 3.7875773906707764,
+       "learning_rate": 1.2189280281214128e-06,
+       "loss": 0.6382,
+       "step": 350
      },
      {
-       "epoch": 2.0,
-       "eval_loss": 0.7324458360671997,
-       "eval_runtime": 20.5637,
-       "eval_samples_per_second": 97.259,
-       "eval_steps_per_second": 12.157,
-       "step": 2500
+       "epoch": 0.56,
+       "eval_loss": 0.6453887820243835,
+       "eval_runtime": 12.102,
+       "eval_samples_per_second": 165.261,
+       "eval_steps_per_second": 10.329,
+       "step": 350
      },
      {
-       "epoch": 2.4,
-       "grad_norm": 5.989898681640625,
+       "epoch": 0.64,
+       "grad_norm": 3.9947192668914795,
+       "learning_rate": 8.613310626523911e-07,
+       "loss": 0.638,
+       "step": 400
+     },
+     {
+       "epoch": 0.64,
+       "eval_loss": 0.6425909399986267,
+       "eval_runtime": 12.0649,
+       "eval_samples_per_second": 165.77,
+       "eval_steps_per_second": 10.361,
+       "step": 400
+     },
+     {
+       "epoch": 0.72,
+       "grad_norm": 3.9734668731689453,
+       "learning_rate": 5.438640153769653e-07,
+       "loss": 0.618,
+       "step": 450
+     },
+     {
+       "epoch": 0.72,
+       "eval_loss": 0.6400230526924133,
+       "eval_runtime": 12.0503,
+       "eval_samples_per_second": 165.97,
+       "eval_steps_per_second": 10.373,
+       "step": 450
+     },
+     {
+       "epoch": 0.8,
+       "grad_norm": 3.733959436416626,
        "learning_rate": 2.86474508437579e-07,
-       "loss": 0.1551,
-       "step": 3000
+       "loss": 0.6378,
+       "step": 500
      },
      {
-       "epoch": 2.8,
-       "grad_norm": 5.329368591308594,
-       "learning_rate": 3.277859889929147e-08,
-       "loss": 0.1518,
-       "step": 3500
+       "epoch": 0.8,
+       "eval_loss": 0.6379128694534302,
+       "eval_runtime": 12.1078,
+       "eval_samples_per_second": 165.183,
+       "eval_steps_per_second": 10.324,
+       "step": 500
      },
      {
-       "epoch": 3.0,
-       "eval_loss": 0.994577169418335,
-       "eval_runtime": 20.6046,
-       "eval_samples_per_second": 97.066,
-       "eval_steps_per_second": 12.133,
-       "step": 3750
+       "epoch": 0.88,
+       "grad_norm": 3.8779754638671875,
+       "learning_rate": 1.0533527116762298e-07,
+       "loss": 0.6338,
+       "step": 550
      },
      {
-       "epoch": 3.0,
-       "step": 3750,
-       "total_flos": 2.3924465493816115e+17,
-       "train_loss": 0.37510518595377607,
-       "train_runtime": 2372.5928,
-       "train_samples_per_second": 12.644,
-       "train_steps_per_second": 1.581
+       "epoch": 0.88,
+       "eval_loss": 0.636811375617981,
+       "eval_runtime": 13.2199,
+       "eval_samples_per_second": 151.287,
+       "eval_steps_per_second": 9.455,
+       "step": 550
+     },
+     {
+       "epoch": 0.96,
+       "grad_norm": 3.924581527709961,
+       "learning_rate": 1.1827948028283353e-08,
+       "loss": 0.6284,
+       "step": 600
+     },
+     {
+       "epoch": 0.96,
+       "eval_loss": 0.6364374160766602,
+       "eval_runtime": 14.3025,
+       "eval_samples_per_second": 139.836,
+       "eval_steps_per_second": 8.74,
+       "step": 600
+     },
+     {
+       "epoch": 1.0,
+       "step": 625,
+       "total_flos": 9.391098276138189e+16,
+       "train_loss": 0.6481949188232422,
+       "train_runtime": 2478.7348,
+       "train_samples_per_second": 4.034,
+       "train_steps_per_second": 0.252
      }
    ],
-   "logging_steps": 500,
-   "max_steps": 3750,
+   "logging_steps": 50,
+   "max_steps": 625,
    "num_input_tokens_seen": 0,
-   "num_train_epochs": 3,
+   "num_train_epochs": 1,
    "save_steps": 500,
    "stateful_callbacks": {
      "TrainerControl": {
@@ -102,14 +209,14 @@
          "should_epoch_stop": false,
          "should_evaluate": false,
          "should_log": false,
-         "should_save": false,
-         "should_training_stop": false
+         "should_save": true,
+         "should_training_stop": true
        },
        "attributes": {}
      }
    },
-   "total_flos": 2.3924465493816115e+17,
-   "train_batch_size": 1,
+   "total_flos": 9.391098276138189e+16,
+   "train_batch_size": 2,
    "trial_name": null,
    "trial_params": null
  }
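
The new run logs and evaluates every 50 steps, so the eval-loss curve in the README table can be recovered directly from `log_history`. A small sketch, assuming only that `trainer_state.json` has been downloaded locally; the field names follow the JSON shown above:

```python
# Hedged sketch: extract the training and eval loss curves from trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Entries with "loss" are optimizer-step logs; entries with "eval_loss" are evaluations.
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(train_points[-1])  # (600, 0.6284); the final summary entry has "train_loss", not "loss", so it is skipped
print(eval_points[-1])   # (600, 0.6364374160766602), the value reported in the model card
```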
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:545922062b1251f9b6ef9cd44ba987560769788f8b01bffb8d5688fdbe889de3
- size 6520
+ oid sha256:1aaec3c721f712e5fe9b43e7244ccf3c19368e555dbdccc7c766e21f77afd02c
+ size 6456
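
`training_args.bin` is the `TrainingArguments` object the `Trainer` pickled for this run, which is why its checksum and size change along with the hyperparameters. A hedged sketch of inspecting it locally; it assumes `torch` and `transformers` are installed so the object can be unpickled, and `weights_only=False` is only needed on recent PyTorch versions:

```python
# Hedged sketch: inspect the pickled TrainingArguments stored in training_args.bin.
import torch

args = torch.load("training_args.bin", weights_only=False)  # plain torch.load(...) on older PyTorch
print(args.learning_rate, args.per_device_train_batch_size, args.num_train_epochs)
# Expected to line up with the README: 3e-06, 2, 1
```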