scottsuk0306 committed on
Commit
abd3264
1 Parent(s): 78f14a9

Model save

Browse files
README.md CHANGED
@@ -2,10 +2,6 @@
2
  license: gemma
3
  base_model: google/gemma-2-9b
4
  tags:
5
- - easylm
6
- - trl
7
- - sft
8
- - generated_from_trainer
9
  - trl
10
  - sft
11
  - generated_from_trainer
@@ -23,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
23
 
24
  This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on the alpaca_farm dataset.
25
  It achieves the following results on the evaluation set:
26
- - Loss: 1.0189
27
 
28
  ## Model description
29
 
@@ -43,13 +39,13 @@ More information needed
43
 
44
  The following hyperparameters were used during training:
45
  - learning_rate: 2e-05
46
- - train_batch_size: 2
47
- - eval_batch_size: 2
48
  - seed: 42
49
  - distributed_type: multi-GPU
50
  - num_devices: 4
51
- - total_train_batch_size: 8
52
- - total_eval_batch_size: 8
53
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
54
  - lr_scheduler_type: cosine
55
  - num_epochs: 3
@@ -58,9 +54,9 @@ The following hyperparameters were used during training:
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:-----:|:----:|:---------------:|
61
- | No log | 1.0 | 13 | 0.7451 |
62
- | No log | 2.0 | 26 | 0.8544 |
63
- | No log | 3.0 | 39 | 1.0189 |
64
 
65
 
66
  ### Framework versions
 
2
  license: gemma
3
  base_model: google/gemma-2-9b
4
  tags:
 
 
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
19
 
20
  This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on the alpaca_farm dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 1.4481
23
 
24
  ## Model description
25
 
 
39
 
40
  The following hyperparameters were used during training:
41
  - learning_rate: 2e-05
42
+ - train_batch_size: 1
43
+ - eval_batch_size: 1
44
  - seed: 42
45
  - distributed_type: multi-GPU
46
  - num_devices: 4
47
+ - total_train_batch_size: 4
48
+ - total_eval_batch_size: 4
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - num_epochs: 3
 
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
+ | 0.9381 | 1.0 | 2500 | 0.9376 |
58
+ | 0.4124 | 2.0 | 5000 | 1.0478 |
59
+ | 0.1515 | 3.0 | 7500 | 1.4481 |
60
 
61
 
62
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 1.0189313888549805,
4
- "eval_runtime": 1.2033,
5
- "eval_samples": 100,
6
- "eval_samples_per_second": 83.104,
7
- "eval_steps_per_second": 10.804,
8
- "total_flos": 2667373869924352.0,
9
- "train_loss": 0.4380333729279347,
10
- "train_runtime": 62.4476,
11
- "train_samples": 100,
12
- "train_samples_per_second": 4.804,
13
- "train_steps_per_second": 0.625
14
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "total_flos": 2.3924465601190298e+17,
4
+ "train_loss": 0.5169153635660807,
5
+ "train_runtime": 5042.7608,
6
+ "train_samples": 10000,
7
+ "train_samples_per_second": 5.949,
8
+ "train_steps_per_second": 1.487
 
 
 
 
 
9
  }
model-00001-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73be13f0027911e41266b06be2f0964f1769a87506d6ca16654821085a1194e6
3
  size 4903351912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5961b1d69d275c39dc069772d844b00dc76548033be4945b533f969d48cf2d2
3
  size 4903351912
model-00002-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22dad6692d8bf3170c55407b9beb1f31a83108383ba0b162882382dfc30297d5
3
  size 4947570872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9412bf9310b91d24ab66be912aaaca7e1b4a948ae143bf3a3cc544989ac5acf
3
  size 4947570872
model-00003-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:069d4a9c1dc6ecd4696dbf8a7ba5e92b86e3d581317f2c14567f0bc33764c7ca
3
  size 4962221464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76ad58c2416f8d3e641c4e0a048d7e79be909b3504d20083fe875ccc83e67442
3
  size 4962221464
model-00004-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c18ea4aecc59553ba832c282834bc40d8021246d59f01789fee1524e564cc55f
3
  size 3670322200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f37d1089517faab0a6bd850d428abecdbe471a6fdd34d39e24e0d6092c17ea3d
3
  size 3670322200
model-00005-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:625618147894e35d0006e2761ed5343576f9c1ba0ef73232ecef83b4652dae0b
3
  size 1835008128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:005757341b2e5052d7c5f78e1807a373a49bdd0938d82a2328f33cad9a6579a5
3
  size 1835008128
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62aac1caf8a9d4c3f0bbcea6f3b568dc4c31697217cfb0a518a27db2e4da992a
3
- size 17518624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7da53ca29fb16f6b2489482fc0bc6a394162cdab14d12764a1755ebc583fea79
3
+ size 17518525
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
- "total_flos": 2667373869924352.0,
4
- "train_loss": 0.4380333729279347,
5
- "train_runtime": 62.4476,
6
- "train_samples": 100,
7
- "train_samples_per_second": 4.804,
8
- "train_steps_per_second": 0.625
9
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "total_flos": 2.3924465601190298e+17,
4
+ "train_loss": 0.5169153635660807,
5
+ "train_runtime": 5042.7608,
6
+ "train_samples": 10000,
7
+ "train_samples_per_second": 5.949,
8
+ "train_steps_per_second": 1.487
9
  }
trainer_state.json CHANGED
@@ -3,47 +3,152 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 39,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  {
12
  "epoch": 1.0,
13
- "eval_loss": 0.7451236844062805,
14
- "eval_runtime": 1.2183,
15
- "eval_samples_per_second": 82.081,
16
- "eval_steps_per_second": 10.67,
17
- "step": 13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  },
19
  {
20
  "epoch": 2.0,
21
- "eval_loss": 0.8544089794158936,
22
- "eval_runtime": 1.1946,
23
- "eval_samples_per_second": 83.713,
24
- "eval_steps_per_second": 10.883,
25
- "step": 26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  },
27
  {
28
  "epoch": 3.0,
29
- "eval_loss": 1.0189313888549805,
30
- "eval_runtime": 1.2024,
31
- "eval_samples_per_second": 83.167,
32
- "eval_steps_per_second": 10.812,
33
- "step": 39
34
  },
35
  {
36
  "epoch": 3.0,
37
- "step": 39,
38
- "total_flos": 2667373869924352.0,
39
- "train_loss": 0.4380333729279347,
40
- "train_runtime": 62.4476,
41
- "train_samples_per_second": 4.804,
42
- "train_steps_per_second": 0.625
43
  }
44
  ],
45
  "logging_steps": 500,
46
- "max_steps": 39,
47
  "num_input_tokens_seen": 0,
48
  "num_train_epochs": 3,
49
  "save_steps": 500,
@@ -59,8 +164,8 @@
59
  "attributes": {}
60
  }
61
  },
62
- "total_flos": 2667373869924352.0,
63
- "train_batch_size": 2,
64
  "trial_name": null,
65
  "trial_params": null
66
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
+ {
12
+ "epoch": 0.2,
13
+ "grad_norm": 5.575812339782715,
14
+ "learning_rate": 1.9781476007338058e-05,
15
+ "loss": 0.9324,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.4,
20
+ "grad_norm": 4.583652973175049,
21
+ "learning_rate": 1.913545457642601e-05,
22
+ "loss": 0.9619,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.6,
27
+ "grad_norm": 3.9456448554992676,
28
+ "learning_rate": 1.8090169943749477e-05,
29
+ "loss": 0.979,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.8,
34
+ "grad_norm": 5.700883865356445,
35
+ "learning_rate": 1.6691306063588583e-05,
36
+ "loss": 0.9591,
37
+ "step": 2000
38
+ },
39
  {
40
  "epoch": 1.0,
41
+ "grad_norm": 3.7766757011413574,
42
+ "learning_rate": 1.5000000000000002e-05,
43
+ "loss": 0.9381,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 1.0,
48
+ "eval_loss": 0.937586784362793,
49
+ "eval_runtime": 43.9207,
50
+ "eval_samples_per_second": 45.537,
51
+ "eval_steps_per_second": 11.384,
52
+ "step": 2500
53
+ },
54
+ {
55
+ "epoch": 1.2,
56
+ "grad_norm": 4.477228164672852,
57
+ "learning_rate": 1.3090169943749475e-05,
58
+ "loss": 0.4559,
59
+ "step": 3000
60
+ },
61
+ {
62
+ "epoch": 1.4,
63
+ "grad_norm": 4.878340721130371,
64
+ "learning_rate": 1.1045284632676535e-05,
65
+ "loss": 0.4521,
66
+ "step": 3500
67
+ },
68
+ {
69
+ "epoch": 1.6,
70
+ "grad_norm": 3.9909627437591553,
71
+ "learning_rate": 8.954715367323468e-06,
72
+ "loss": 0.4468,
73
+ "step": 4000
74
+ },
75
+ {
76
+ "epoch": 1.8,
77
+ "grad_norm": 2.667670726776123,
78
+ "learning_rate": 6.909830056250527e-06,
79
+ "loss": 0.4321,
80
+ "step": 4500
81
+ },
82
+ {
83
+ "epoch": 2.0,
84
+ "grad_norm": 4.369731903076172,
85
+ "learning_rate": 5.000000000000003e-06,
86
+ "loss": 0.4124,
87
+ "step": 5000
88
  },
89
  {
90
  "epoch": 2.0,
91
+ "eval_loss": 1.0478131771087646,
92
+ "eval_runtime": 41.0932,
93
+ "eval_samples_per_second": 48.67,
94
+ "eval_steps_per_second": 12.167,
95
+ "step": 5000
96
+ },
97
+ {
98
+ "epoch": 2.2,
99
+ "grad_norm": 2.2593352794647217,
100
+ "learning_rate": 3.308693936411421e-06,
101
+ "loss": 0.1657,
102
+ "step": 5500
103
+ },
104
+ {
105
+ "epoch": 2.4,
106
+ "grad_norm": 2.3517982959747314,
107
+ "learning_rate": 1.9098300562505266e-06,
108
+ "loss": 0.1616,
109
+ "step": 6000
110
+ },
111
+ {
112
+ "epoch": 2.6,
113
+ "grad_norm": 2.2167694568634033,
114
+ "learning_rate": 8.645454235739903e-07,
115
+ "loss": 0.153,
116
+ "step": 6500
117
+ },
118
+ {
119
+ "epoch": 2.8,
120
+ "grad_norm": 2.2378008365631104,
121
+ "learning_rate": 2.1852399266194312e-07,
122
+ "loss": 0.1521,
123
+ "step": 7000
124
+ },
125
+ {
126
+ "epoch": 3.0,
127
+ "grad_norm": 1.863773226737976,
128
+ "learning_rate": 0.0,
129
+ "loss": 0.1515,
130
+ "step": 7500
131
  },
132
  {
133
  "epoch": 3.0,
134
+ "eval_loss": 1.4480849504470825,
135
+ "eval_runtime": 41.4492,
136
+ "eval_samples_per_second": 48.252,
137
+ "eval_steps_per_second": 12.063,
138
+ "step": 7500
139
  },
140
  {
141
  "epoch": 3.0,
142
+ "step": 7500,
143
+ "total_flos": 2.3924465601190298e+17,
144
+ "train_loss": 0.5169153635660807,
145
+ "train_runtime": 5042.7608,
146
+ "train_samples_per_second": 5.949,
147
+ "train_steps_per_second": 1.487
148
  }
149
  ],
150
  "logging_steps": 500,
151
+ "max_steps": 7500,
152
  "num_input_tokens_seen": 0,
153
  "num_train_epochs": 3,
154
  "save_steps": 500,
 
164
  "attributes": {}
165
  }
166
  },
167
+ "total_flos": 2.3924465601190298e+17,
168
+ "train_batch_size": 1,
169
  "trial_name": null,
170
  "trial_params": null
171
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ce09131e321f54043af13f56c4236677dd675ebddec7aef867688091507c91a
3
  size 6520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e0430645b73c3ab0a8f9201865b6bc0ad3d630c39c2224f820bdb6c9006a4db
3
  size 6520