ChenWu98 commited on
Commit
3e02bbf
·
verified ·
1 Parent(s): fa6635b

Model save

Browse files
Files changed (5) hide show
  1. README.md +68 -0
  2. all_results.json +13 -0
  3. eval_results.json +8 -0
  4. train_results.json +8 -0
  5. trainer_state.json +152 -0
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - sft
7
+ - generated_from_trainer
8
+ base_model: HuggingFaceH4/zephyr-7b-beta
9
+ model-index:
10
+ - name: skills_metaphor_chat-lora
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # skills_metaphor_chat-lora
18
+
19
+ This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on an unspecified dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.2110
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 0.0002
41
+ - train_batch_size: 4
42
+ - eval_batch_size: 8
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - gradient_accumulation_steps: 4
46
+ - total_train_batch_size: 16
47
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
+ - lr_scheduler_type: cosine
49
+ - lr_scheduler_warmup_ratio: 0.1
50
+ - num_epochs: 4.0
51
+
52
+ ### Training results
53
+
54
+ | Training Loss | Epoch | Step | Validation Loss |
55
+ |:-------------:|:-----:|:----:|:---------------:|
56
+ | 0.2706 | 0.96 | 18 | 0.2212 |
57
+ | 0.173 | 1.97 | 37 | 0.2041 |
58
+ | 0.1473 | 2.99 | 56 | 0.2078 |
59
+ | 0.1194 | 3.84 | 72 | 0.2110 |
60
+
61
+
62
+ ### Framework versions
63
+
64
+ - PEFT 0.7.1
65
+ - Transformers 4.37.2
66
+ - Pytorch 2.1.2+cu121
67
+ - Datasets 2.14.6
68
+ - Tokenizers 0.15.1
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.84,
3
+ "eval_loss": 0.2110341489315033,
4
+ "eval_runtime": 4.1152,
5
+ "eval_samples": 100,
6
+ "eval_samples_per_second": 24.3,
7
+ "eval_steps_per_second": 3.159,
8
+ "train_loss": 0.3370748994251092,
9
+ "train_runtime": 404.182,
10
+ "train_samples": 300,
11
+ "train_samples_per_second": 2.969,
12
+ "train_steps_per_second": 0.178
13
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.84,
3
+ "eval_loss": 0.2110341489315033,
4
+ "eval_runtime": 4.1152,
5
+ "eval_samples": 100,
6
+ "eval_samples_per_second": 24.3,
7
+ "eval_steps_per_second": 3.159
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.84,
3
+ "train_loss": 0.3370748994251092,
4
+ "train_runtime": 404.182,
5
+ "train_samples": 300,
6
+ "train_samples_per_second": 2.969,
7
+ "train_steps_per_second": 0.178
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.84,
5
+ "eval_steps": 500,
6
+ "global_step": 72,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05,
13
+ "learning_rate": 2.5e-05,
14
+ "loss": 2.1043,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.27,
19
+ "learning_rate": 0.000125,
20
+ "loss": 1.9092,
21
+ "step": 5
22
+ },
23
+ {
24
+ "epoch": 0.53,
25
+ "learning_rate": 0.0001995184726672197,
26
+ "loss": 0.8641,
27
+ "step": 10
28
+ },
29
+ {
30
+ "epoch": 0.8,
31
+ "learning_rate": 0.00019415440651830208,
32
+ "loss": 0.2706,
33
+ "step": 15
34
+ },
35
+ {
36
+ "epoch": 0.96,
37
+ "eval_loss": 0.22117801010608673,
38
+ "eval_runtime": 5.0546,
39
+ "eval_samples_per_second": 19.784,
40
+ "eval_steps_per_second": 2.572,
41
+ "step": 18
42
+ },
43
+ {
44
+ "epoch": 1.07,
45
+ "learning_rate": 0.00018314696123025454,
46
+ "loss": 0.2201,
47
+ "step": 20
48
+ },
49
+ {
50
+ "epoch": 1.33,
51
+ "learning_rate": 0.00016715589548470185,
52
+ "loss": 0.1899,
53
+ "step": 25
54
+ },
55
+ {
56
+ "epoch": 1.6,
57
+ "learning_rate": 0.0001471396736825998,
58
+ "loss": 0.1831,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 1.87,
63
+ "learning_rate": 0.0001242980179903264,
64
+ "loss": 0.173,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 1.97,
69
+ "eval_loss": 0.2041378617286682,
70
+ "eval_runtime": 4.1292,
71
+ "eval_samples_per_second": 24.218,
72
+ "eval_steps_per_second": 3.148,
73
+ "step": 37
74
+ },
75
+ {
76
+ "epoch": 2.13,
77
+ "learning_rate": 0.0001,
78
+ "loss": 0.1584,
79
+ "step": 40
80
+ },
81
+ {
82
+ "epoch": 2.4,
83
+ "learning_rate": 7.570198200967362e-05,
84
+ "loss": 0.1522,
85
+ "step": 45
86
+ },
87
+ {
88
+ "epoch": 2.67,
89
+ "learning_rate": 5.286032631740023e-05,
90
+ "loss": 0.1322,
91
+ "step": 50
92
+ },
93
+ {
94
+ "epoch": 2.93,
95
+ "learning_rate": 3.2844104515298155e-05,
96
+ "loss": 0.1473,
97
+ "step": 55
98
+ },
99
+ {
100
+ "epoch": 2.99,
101
+ "eval_loss": 0.20777387917041779,
102
+ "eval_runtime": 4.0732,
103
+ "eval_samples_per_second": 24.551,
104
+ "eval_steps_per_second": 3.192,
105
+ "step": 56
106
+ },
107
+ {
108
+ "epoch": 3.2,
109
+ "learning_rate": 1.6853038769745467e-05,
110
+ "loss": 0.1254,
111
+ "step": 60
112
+ },
113
+ {
114
+ "epoch": 3.47,
115
+ "learning_rate": 5.8455934816979305e-06,
116
+ "loss": 0.122,
117
+ "step": 65
118
+ },
119
+ {
120
+ "epoch": 3.73,
121
+ "learning_rate": 4.815273327803182e-07,
122
+ "loss": 0.1194,
123
+ "step": 70
124
+ },
125
+ {
126
+ "epoch": 3.84,
127
+ "eval_loss": 0.2110341489315033,
128
+ "eval_runtime": 4.0834,
129
+ "eval_samples_per_second": 24.49,
130
+ "eval_steps_per_second": 3.184,
131
+ "step": 72
132
+ },
133
+ {
134
+ "epoch": 3.84,
135
+ "step": 72,
136
+ "total_flos": 38498470559744.0,
137
+ "train_loss": 0.3370748994251092,
138
+ "train_runtime": 404.182,
139
+ "train_samples_per_second": 2.969,
140
+ "train_steps_per_second": 0.178
141
+ }
142
+ ],
143
+ "logging_steps": 5,
144
+ "max_steps": 72,
145
+ "num_input_tokens_seen": 0,
146
+ "num_train_epochs": 4,
147
+ "save_steps": 500,
148
+ "total_flos": 38498470559744.0,
149
+ "train_batch_size": 4,
150
+ "trial_name": null,
151
+ "trial_params": null
152
+ }