Jae-star committed on
Commit
824abe0
·
verified ·
1 Parent(s): 266618f

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +68 -0
  2. checkpoint-40000/config.json +30 -0
  3. checkpoint-40000/generation_config.json +7 -0
  4. checkpoint-40000/model.safetensors +3 -0
  5. checkpoint-40000/optimizer.pt +3 -0
  6. checkpoint-40000/rng_state.pth +3 -0
  7. checkpoint-40000/scheduler.pt +3 -0
  8. checkpoint-40000/special_tokens_map.json +24 -0
  9. checkpoint-40000/tokenizer.model +3 -0
  10. checkpoint-40000/tokenizer_config.json +35 -0
  11. checkpoint-40000/trainer_state.json +189 -0
  12. checkpoint-40000/training_args.bin +3 -0
  13. checkpoint-45000/config.json +30 -0
  14. checkpoint-45000/generation_config.json +7 -0
  15. checkpoint-45000/model.safetensors +3 -0
  16. checkpoint-45000/optimizer.pt +3 -0
  17. checkpoint-45000/rng_state.pth +3 -0
  18. checkpoint-45000/scheduler.pt +3 -0
  19. checkpoint-45000/special_tokens_map.json +24 -0
  20. checkpoint-45000/tokenizer.model +3 -0
  21. checkpoint-45000/tokenizer_config.json +35 -0
  22. checkpoint-45000/trainer_state.json +211 -0
  23. checkpoint-45000/training_args.bin +3 -0
  24. checkpoint-50000/config.json +30 -0
  25. checkpoint-50000/generation_config.json +7 -0
  26. checkpoint-50000/model.safetensors +3 -0
  27. checkpoint-50000/optimizer.pt +3 -0
  28. checkpoint-50000/rng_state.pth +3 -0
  29. checkpoint-50000/scheduler.pt +3 -0
  30. checkpoint-50000/special_tokens_map.json +24 -0
  31. checkpoint-50000/tokenizer.model +3 -0
  32. checkpoint-50000/tokenizer_config.json +35 -0
  33. checkpoint-50000/trainer_state.json +226 -0
  34. checkpoint-50000/training_args.bin +3 -0
  35. checkpoint-55000/config.json +30 -0
  36. checkpoint-55000/generation_config.json +7 -0
  37. checkpoint-55000/model.safetensors +3 -0
  38. checkpoint-55000/optimizer.pt +3 -0
  39. checkpoint-55000/rng_state.pth +3 -0
  40. checkpoint-55000/scheduler.pt +3 -0
  41. checkpoint-55000/special_tokens_map.json +24 -0
  42. checkpoint-55000/tokenizer.model +3 -0
  43. checkpoint-55000/tokenizer_config.json +35 -0
  44. checkpoint-55000/trainer_state.json +248 -0
  45. checkpoint-55000/training_args.bin +3 -0
  46. checkpoint-60000/config.json +30 -0
  47. checkpoint-60000/generation_config.json +7 -0
  48. checkpoint-60000/model.safetensors +3 -0
  49. checkpoint-60000/optimizer.pt +3 -0
  50. checkpoint-60000/rng_state.pth +3 -0
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - generated_from_trainer
5
+ model-index:
6
+ - name: llama-fin
7
+ results: []
8
+ ---
9
+
10
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
11
+ should probably proofread and complete it, then remove this comment. -->
12
+
13
+ # llama-fin
14
+
15
+ This model is a fine-tuned version of an unspecified base model on an unknown dataset.
16
+ It achieves the following results on the evaluation set:
17
+ - Loss: 1.2086
18
+
19
+ ## Model description
20
+
21
+ More information needed
22
+
23
+ ## Intended uses & limitations
24
+
25
+ More information needed
26
+
27
+ ## Training and evaluation data
28
+
29
+ More information needed
30
+
31
+ ## Training procedure
32
+
33
+ ### Training hyperparameters
34
+
35
+ The following hyperparameters were used during training:
36
+ - learning_rate: 0.0003
37
+ - train_batch_size: 32
38
+ - eval_batch_size: 32
39
+ - seed: 42
40
+ - optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
41
+ - lr_scheduler_type: cosine
42
+ - lr_scheduler_warmup_steps: 1000
43
+ - num_epochs: 2
44
+
45
+ ### Training results
46
+
47
+ | Training Loss | Epoch | Step | Validation Loss |
48
+ |:-------------:|:------:|:-----:|:---------------:|
49
+ | 3.0634 | 0.1593 | 5000 | 1.6380 |
50
+ | 1.5345 | 0.3185 | 10000 | 1.4842 |
51
+ | 1.4255 | 0.4778 | 15000 | 1.4151 |
52
+ | 1.3929 | 0.6370 | 20000 | 1.3720 |
53
+ | 1.3462 | 0.7963 | 25000 | 1.3367 |
54
+ | 1.3094 | 0.9555 | 30000 | 1.3087 |
55
+ | 1.2835 | 1.1148 | 35000 | 1.2838 |
56
+ | 1.2534 | 1.2740 | 40000 | 1.2605 |
57
+ | 1.2303 | 1.4333 | 45000 | 1.2407 |
58
+ | 1.2187 | 1.5926 | 50000 | 1.2244 |
59
+ | 1.2001 | 1.7518 | 55000 | 1.2133 |
60
+ | 1.1937 | 1.9111 | 60000 | 1.2086 |
61
+
62
+
63
+ ### Framework versions
64
+
65
+ - Transformers 4.51.3
66
+ - Pytorch 2.1.0+cu118
67
+ - Datasets 3.5.0
68
+ - Tokenizers 0.21.1
checkpoint-40000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 32,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 256,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 512,
14
+ "max_position_embeddings": 512,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 30,
19
+ "num_key_value_heads": 8,
20
+ "pad_token_id": 2,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-06,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": true,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.51.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
checkpoint-40000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.51.3"
7
+ }
checkpoint-40000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:864546b2115d06e4615fc33352ac622e9ad2372f57dcefb0c724e56127bb1e70
3
+ size 111503488
checkpoint-40000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:234d26946c38b6ab35294a1fe3ab6e3d77d3b751822980376a5de3a810ff8ff4
3
+ size 223176442
checkpoint-40000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d245e05e72192c132e0f2edb6fdcae0c578c890f0fe912f17ec7b0bba2d38cc3
3
+ size 14244
checkpoint-40000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6d6bf6fbcda20051d0ed7c9373fc7da2d2e120cb1ffdd71dce90a7adb1c857f
3
+ size 1064
checkpoint-40000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-40000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-40000/tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "1": {
7
+ "content": "<s>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "2": {
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ }
22
+ },
23
+ "bos_token": "<s>",
24
+ "clean_up_tokenization_spaces": false,
25
+ "eos_token": "</s>",
26
+ "extra_special_tokens": {},
27
+ "legacy": true,
28
+ "model_max_length": 1000000000000000019884624838656,
29
+ "pad_token": "</s>",
30
+ "sp_model_kwargs": {},
31
+ "spaces_between_special_tokens": false,
32
+ "tokenizer_class": "LlamaTokenizer",
33
+ "unk_token": "",
34
+ "use_default_system_prompt": false
35
+ }
checkpoint-40000/trainer_state.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": 1.260542392730713,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.274047649382087,
6
+ "eval_steps": 5000,
7
+ "global_step": 40000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09555357370365651,
14
+ "grad_norm": 0.6688508987426758,
15
+ "learning_rate": 0.0002992259870276421,
16
+ "loss": 3.0634,
17
+ "step": 3000
18
+ },
19
+ {
20
+ "epoch": 0.15925595617276087,
21
+ "eval_loss": 1.6379895210266113,
22
+ "eval_runtime": 36.9077,
23
+ "eval_samples_per_second": 273.764,
24
+ "eval_steps_per_second": 8.562,
25
+ "step": 5000
26
+ },
27
+ {
28
+ "epoch": 0.19110714740731302,
29
+ "grad_norm": 0.5710486173629761,
30
+ "learning_rate": 0.00029518136830057303,
31
+ "loss": 1.6711,
32
+ "step": 6000
33
+ },
34
+ {
35
+ "epoch": 0.2866607211109696,
36
+ "grad_norm": 0.5231711864471436,
37
+ "learning_rate": 0.00028776584441877383,
38
+ "loss": 1.5345,
39
+ "step": 9000
40
+ },
41
+ {
42
+ "epoch": 0.31851191234552173,
43
+ "eval_loss": 1.4841793775558472,
44
+ "eval_runtime": 36.8712,
45
+ "eval_samples_per_second": 274.035,
46
+ "eval_steps_per_second": 8.57,
47
+ "step": 10000
48
+ },
49
+ {
50
+ "epoch": 0.38221429481462604,
51
+ "grad_norm": 0.4970897436141968,
52
+ "learning_rate": 0.00027715159331368833,
53
+ "loss": 1.4714,
54
+ "step": 12000
55
+ },
56
+ {
57
+ "epoch": 0.47776786851828257,
58
+ "grad_norm": 0.5002285242080688,
59
+ "learning_rate": 0.0002635850628193778,
60
+ "loss": 1.4255,
61
+ "step": 15000
62
+ },
63
+ {
64
+ "epoch": 0.47776786851828257,
65
+ "eval_loss": 1.4150956869125366,
66
+ "eval_runtime": 36.8321,
67
+ "eval_samples_per_second": 274.326,
68
+ "eval_steps_per_second": 8.579,
69
+ "step": 15000
70
+ },
71
+ {
72
+ "epoch": 0.5733214422219391,
73
+ "grad_norm": 0.49311497807502747,
74
+ "learning_rate": 0.0002473812485038713,
75
+ "loss": 1.3929,
76
+ "step": 18000
77
+ },
78
+ {
79
+ "epoch": 0.6370238246910435,
80
+ "eval_loss": 1.3720492124557495,
81
+ "eval_runtime": 36.8585,
82
+ "eval_samples_per_second": 274.13,
83
+ "eval_steps_per_second": 8.573,
84
+ "step": 20000
85
+ },
86
+ {
87
+ "epoch": 0.6688750159255956,
88
+ "grad_norm": 0.5144098401069641,
89
+ "learning_rate": 0.00022891637991949928,
90
+ "loss": 1.3669,
91
+ "step": 21000
92
+ },
93
+ {
94
+ "epoch": 0.7644285896292521,
95
+ "grad_norm": 0.5176816582679749,
96
+ "learning_rate": 0.00020861918508708878,
97
+ "loss": 1.3462,
98
+ "step": 24000
99
+ },
100
+ {
101
+ "epoch": 0.7962797808638044,
102
+ "eval_loss": 1.3367421627044678,
103
+ "eval_runtime": 36.8485,
104
+ "eval_samples_per_second": 274.204,
105
+ "eval_steps_per_second": 8.576,
106
+ "step": 25000
107
+ },
108
+ {
109
+ "epoch": 0.8599821633329087,
110
+ "grad_norm": 0.5269707441329956,
111
+ "learning_rate": 0.00018696093604028994,
112
+ "loss": 1.3283,
113
+ "step": 27000
114
+ },
115
+ {
116
+ "epoch": 0.9555357370365651,
117
+ "grad_norm": 0.5370417237281799,
118
+ "learning_rate": 0.0001644445065583549,
119
+ "loss": 1.3094,
120
+ "step": 30000
121
+ },
122
+ {
123
+ "epoch": 0.9555357370365651,
124
+ "eval_loss": 1.3087339401245117,
125
+ "eval_runtime": 36.9277,
126
+ "eval_samples_per_second": 273.616,
127
+ "eval_steps_per_second": 8.557,
128
+ "step": 30000
129
+ },
130
+ {
131
+ "epoch": 1.0510893107402217,
132
+ "grad_norm": 0.5604170560836792,
133
+ "learning_rate": 0.00014159269615127048,
134
+ "loss": 1.2835,
135
+ "step": 33000
136
+ },
137
+ {
138
+ "epoch": 1.114791693209326,
139
+ "eval_loss": 1.283848762512207,
140
+ "eval_runtime": 36.9334,
141
+ "eval_samples_per_second": 273.573,
142
+ "eval_steps_per_second": 8.556,
143
+ "step": 35000
144
+ },
145
+ {
146
+ "epoch": 1.1466428844438783,
147
+ "grad_norm": 0.5592519044876099,
148
+ "learning_rate": 0.00011893609139772691,
149
+ "loss": 1.265,
150
+ "step": 36000
151
+ },
152
+ {
153
+ "epoch": 1.2421964581475347,
154
+ "grad_norm": 0.594070315361023,
155
+ "learning_rate": 9.700074647841136e-05,
156
+ "loss": 1.2534,
157
+ "step": 39000
158
+ },
159
+ {
160
+ "epoch": 1.274047649382087,
161
+ "eval_loss": 1.260542392730713,
162
+ "eval_runtime": 36.9536,
163
+ "eval_samples_per_second": 273.424,
164
+ "eval_steps_per_second": 8.551,
165
+ "step": 40000
166
+ }
167
+ ],
168
+ "logging_steps": 3000,
169
+ "max_steps": 62792,
170
+ "num_input_tokens_seen": 0,
171
+ "num_train_epochs": 2,
172
+ "save_steps": 500,
173
+ "stateful_callbacks": {
174
+ "TrainerControl": {
175
+ "args": {
176
+ "should_epoch_stop": false,
177
+ "should_evaluate": false,
178
+ "should_log": false,
179
+ "should_save": true,
180
+ "should_training_stop": false
181
+ },
182
+ "attributes": {}
183
+ }
184
+ },
185
+ "total_flos": 7.737039281691034e+16,
186
+ "train_batch_size": 32,
187
+ "trial_name": null,
188
+ "trial_params": null
189
+ }
checkpoint-40000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47dea390da80729e81a7320baa573ab0bba14035962ea3bf179f44ed8e91a0d9
3
+ size 5304
checkpoint-45000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 32,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 256,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 512,
14
+ "max_position_embeddings": 512,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 30,
19
+ "num_key_value_heads": 8,
20
+ "pad_token_id": 2,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-06,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": true,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.51.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
checkpoint-45000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.51.3"
7
+ }
checkpoint-45000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4d4c55a0d3b55d3c33a3e7c772c42da53586fbbc31295bd20ba9d213a57bb1e
3
+ size 111503488
checkpoint-45000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb7636c39ee46f24ef0e7e6f60d06533989a5e1d760ef77d995331baaa945925
3
+ size 223176442
checkpoint-45000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5edb34d031c0c2b447f3eaadb401a4c1e7e7e6d8c096e28b7092e01a8bd48c92
3
+ size 14244
checkpoint-45000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:400f89ab81606f66afc30799e6e50a56917758a20d3c6664119029c6a34c6ad9
3
+ size 1064
checkpoint-45000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-45000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-45000/tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "1": {
7
+ "content": "<s>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "2": {
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ }
22
+ },
23
+ "bos_token": "<s>",
24
+ "clean_up_tokenization_spaces": false,
25
+ "eos_token": "</s>",
26
+ "extra_special_tokens": {},
27
+ "legacy": true,
28
+ "model_max_length": 1000000000000000019884624838656,
29
+ "pad_token": "</s>",
30
+ "sp_model_kwargs": {},
31
+ "spaces_between_special_tokens": false,
32
+ "tokenizer_class": "LlamaTokenizer",
33
+ "unk_token": "",
34
+ "use_default_system_prompt": false
35
+ }
checkpoint-45000/trainer_state.json ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": 1.2406779527664185,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.4333036055548478,
6
+ "eval_steps": 5000,
7
+ "global_step": 45000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09555357370365651,
14
+ "grad_norm": 0.6688508987426758,
15
+ "learning_rate": 0.0002992259870276421,
16
+ "loss": 3.0634,
17
+ "step": 3000
18
+ },
19
+ {
20
+ "epoch": 0.15925595617276087,
21
+ "eval_loss": 1.6379895210266113,
22
+ "eval_runtime": 36.9077,
23
+ "eval_samples_per_second": 273.764,
24
+ "eval_steps_per_second": 8.562,
25
+ "step": 5000
26
+ },
27
+ {
28
+ "epoch": 0.19110714740731302,
29
+ "grad_norm": 0.5710486173629761,
30
+ "learning_rate": 0.00029518136830057303,
31
+ "loss": 1.6711,
32
+ "step": 6000
33
+ },
34
+ {
35
+ "epoch": 0.2866607211109696,
36
+ "grad_norm": 0.5231711864471436,
37
+ "learning_rate": 0.00028776584441877383,
38
+ "loss": 1.5345,
39
+ "step": 9000
40
+ },
41
+ {
42
+ "epoch": 0.31851191234552173,
43
+ "eval_loss": 1.4841793775558472,
44
+ "eval_runtime": 36.8712,
45
+ "eval_samples_per_second": 274.035,
46
+ "eval_steps_per_second": 8.57,
47
+ "step": 10000
48
+ },
49
+ {
50
+ "epoch": 0.38221429481462604,
51
+ "grad_norm": 0.4970897436141968,
52
+ "learning_rate": 0.00027715159331368833,
53
+ "loss": 1.4714,
54
+ "step": 12000
55
+ },
56
+ {
57
+ "epoch": 0.47776786851828257,
58
+ "grad_norm": 0.5002285242080688,
59
+ "learning_rate": 0.0002635850628193778,
60
+ "loss": 1.4255,
61
+ "step": 15000
62
+ },
63
+ {
64
+ "epoch": 0.47776786851828257,
65
+ "eval_loss": 1.4150956869125366,
66
+ "eval_runtime": 36.8321,
67
+ "eval_samples_per_second": 274.326,
68
+ "eval_steps_per_second": 8.579,
69
+ "step": 15000
70
+ },
71
+ {
72
+ "epoch": 0.5733214422219391,
73
+ "grad_norm": 0.49311497807502747,
74
+ "learning_rate": 0.0002473812485038713,
75
+ "loss": 1.3929,
76
+ "step": 18000
77
+ },
78
+ {
79
+ "epoch": 0.6370238246910435,
80
+ "eval_loss": 1.3720492124557495,
81
+ "eval_runtime": 36.8585,
82
+ "eval_samples_per_second": 274.13,
83
+ "eval_steps_per_second": 8.573,
84
+ "step": 20000
85
+ },
86
+ {
87
+ "epoch": 0.6688750159255956,
88
+ "grad_norm": 0.5144098401069641,
89
+ "learning_rate": 0.00022891637991949928,
90
+ "loss": 1.3669,
91
+ "step": 21000
92
+ },
93
+ {
94
+ "epoch": 0.7644285896292521,
95
+ "grad_norm": 0.5176816582679749,
96
+ "learning_rate": 0.00020861918508708878,
97
+ "loss": 1.3462,
98
+ "step": 24000
99
+ },
100
+ {
101
+ "epoch": 0.7962797808638044,
102
+ "eval_loss": 1.3367421627044678,
103
+ "eval_runtime": 36.8485,
104
+ "eval_samples_per_second": 274.204,
105
+ "eval_steps_per_second": 8.576,
106
+ "step": 25000
107
+ },
108
+ {
109
+ "epoch": 0.8599821633329087,
110
+ "grad_norm": 0.5269707441329956,
111
+ "learning_rate": 0.00018696093604028994,
112
+ "loss": 1.3283,
113
+ "step": 27000
114
+ },
115
+ {
116
+ "epoch": 0.9555357370365651,
117
+ "grad_norm": 0.5370417237281799,
118
+ "learning_rate": 0.0001644445065583549,
119
+ "loss": 1.3094,
120
+ "step": 30000
121
+ },
122
+ {
123
+ "epoch": 0.9555357370365651,
124
+ "eval_loss": 1.3087339401245117,
125
+ "eval_runtime": 36.9277,
126
+ "eval_samples_per_second": 273.616,
127
+ "eval_steps_per_second": 8.557,
128
+ "step": 30000
129
+ },
130
+ {
131
+ "epoch": 1.0510893107402217,
132
+ "grad_norm": 0.5604170560836792,
133
+ "learning_rate": 0.00014159269615127048,
134
+ "loss": 1.2835,
135
+ "step": 33000
136
+ },
137
+ {
138
+ "epoch": 1.114791693209326,
139
+ "eval_loss": 1.283848762512207,
140
+ "eval_runtime": 36.9334,
141
+ "eval_samples_per_second": 273.573,
142
+ "eval_steps_per_second": 8.556,
143
+ "step": 35000
144
+ },
145
+ {
146
+ "epoch": 1.1466428844438783,
147
+ "grad_norm": 0.5592519044876099,
148
+ "learning_rate": 0.00011893609139772691,
149
+ "loss": 1.265,
150
+ "step": 36000
151
+ },
152
+ {
153
+ "epoch": 1.2421964581475347,
154
+ "grad_norm": 0.594070315361023,
155
+ "learning_rate": 9.700074647841136e-05,
156
+ "loss": 1.2534,
157
+ "step": 39000
158
+ },
159
+ {
160
+ "epoch": 1.274047649382087,
161
+ "eval_loss": 1.260542392730713,
162
+ "eval_runtime": 36.9536,
163
+ "eval_samples_per_second": 273.424,
164
+ "eval_steps_per_second": 8.551,
165
+ "step": 40000
166
+ },
167
+ {
168
+ "epoch": 1.3377500318511912,
169
+ "grad_norm": 0.6026796102523804,
170
+ "learning_rate": 7.629596894514561e-05,
171
+ "loss": 1.2422,
172
+ "step": 42000
173
+ },
174
+ {
175
+ "epoch": 1.4333036055548478,
176
+ "grad_norm": 0.5905404090881348,
177
+ "learning_rate": 5.7302494322958876e-05,
178
+ "loss": 1.2303,
179
+ "step": 45000
180
+ },
181
+ {
182
+ "epoch": 1.4333036055548478,
183
+ "eval_loss": 1.2406779527664185,
184
+ "eval_runtime": 37.226,
185
+ "eval_samples_per_second": 271.423,
186
+ "eval_steps_per_second": 8.489,
187
+ "step": 45000
188
+ }
189
+ ],
190
+ "logging_steps": 3000,
191
+ "max_steps": 62792,
192
+ "num_input_tokens_seen": 0,
193
+ "num_train_epochs": 2,
194
+ "save_steps": 500,
195
+ "stateful_callbacks": {
196
+ "TrainerControl": {
197
+ "args": {
198
+ "should_epoch_stop": false,
199
+ "should_evaluate": false,
200
+ "should_log": false,
201
+ "should_save": true,
202
+ "should_training_stop": false
203
+ },
204
+ "attributes": {}
205
+ }
206
+ },
207
+ "total_flos": 8.704174480923034e+16,
208
+ "train_batch_size": 32,
209
+ "trial_name": null,
210
+ "trial_params": null
211
+ }
checkpoint-45000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47dea390da80729e81a7320baa573ab0bba14035962ea3bf179f44ed8e91a0d9
3
+ size 5304
checkpoint-50000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 32,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 256,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 512,
14
+ "max_position_embeddings": 512,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 30,
19
+ "num_key_value_heads": 8,
20
+ "pad_token_id": 2,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-06,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": true,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.51.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
checkpoint-50000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.51.3"
7
+ }
checkpoint-50000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3af2ad45fd40654c1497f7418106a41af852323d15342d41f60ca689dd4f34b
3
+ size 111503488
checkpoint-50000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cacbc08856cfb789a10c5f977dab052508daf0d5646680a7da3ba398d825fd49
3
+ size 223176442
checkpoint-50000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3e5d946241df2516b06d7074d8779088eae7607173ad780df56583910a9589b
3
+ size 14244
checkpoint-50000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cabec674ef0d60462518cf5c37f7fc011a2a644ee516e6b0525d88cbd7ebabd0
3
+ size 1064
checkpoint-50000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-50000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-50000/tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "1": {
7
+ "content": "<s>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "2": {
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ }
22
+ },
23
+ "bos_token": "<s>",
24
+ "clean_up_tokenization_spaces": false,
25
+ "eos_token": "</s>",
26
+ "extra_special_tokens": {},
27
+ "legacy": true,
28
+ "model_max_length": 1000000000000000019884624838656,
29
+ "pad_token": "</s>",
30
+ "sp_model_kwargs": {},
31
+ "spaces_between_special_tokens": false,
32
+ "tokenizer_class": "LlamaTokenizer",
33
+ "unk_token": "",
34
+ "use_default_system_prompt": false
35
+ }
checkpoint-50000/trainer_state.json ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": 1.2243584394454956,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.5925595617276085,
6
+ "eval_steps": 5000,
7
+ "global_step": 50000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09555357370365651,
14
+ "grad_norm": 0.6688508987426758,
15
+ "learning_rate": 0.0002992259870276421,
16
+ "loss": 3.0634,
17
+ "step": 3000
18
+ },
19
+ {
20
+ "epoch": 0.15925595617276087,
21
+ "eval_loss": 1.6379895210266113,
22
+ "eval_runtime": 36.9077,
23
+ "eval_samples_per_second": 273.764,
24
+ "eval_steps_per_second": 8.562,
25
+ "step": 5000
26
+ },
27
+ {
28
+ "epoch": 0.19110714740731302,
29
+ "grad_norm": 0.5710486173629761,
30
+ "learning_rate": 0.00029518136830057303,
31
+ "loss": 1.6711,
32
+ "step": 6000
33
+ },
34
+ {
35
+ "epoch": 0.2866607211109696,
36
+ "grad_norm": 0.5231711864471436,
37
+ "learning_rate": 0.00028776584441877383,
38
+ "loss": 1.5345,
39
+ "step": 9000
40
+ },
41
+ {
42
+ "epoch": 0.31851191234552173,
43
+ "eval_loss": 1.4841793775558472,
44
+ "eval_runtime": 36.8712,
45
+ "eval_samples_per_second": 274.035,
46
+ "eval_steps_per_second": 8.57,
47
+ "step": 10000
48
+ },
49
+ {
50
+ "epoch": 0.38221429481462604,
51
+ "grad_norm": 0.4970897436141968,
52
+ "learning_rate": 0.00027715159331368833,
53
+ "loss": 1.4714,
54
+ "step": 12000
55
+ },
56
+ {
57
+ "epoch": 0.47776786851828257,
58
+ "grad_norm": 0.5002285242080688,
59
+ "learning_rate": 0.0002635850628193778,
60
+ "loss": 1.4255,
61
+ "step": 15000
62
+ },
63
+ {
64
+ "epoch": 0.47776786851828257,
65
+ "eval_loss": 1.4150956869125366,
66
+ "eval_runtime": 36.8321,
67
+ "eval_samples_per_second": 274.326,
68
+ "eval_steps_per_second": 8.579,
69
+ "step": 15000
70
+ },
71
+ {
72
+ "epoch": 0.5733214422219391,
73
+ "grad_norm": 0.49311497807502747,
74
+ "learning_rate": 0.0002473812485038713,
75
+ "loss": 1.3929,
76
+ "step": 18000
77
+ },
78
+ {
79
+ "epoch": 0.6370238246910435,
80
+ "eval_loss": 1.3720492124557495,
81
+ "eval_runtime": 36.8585,
82
+ "eval_samples_per_second": 274.13,
83
+ "eval_steps_per_second": 8.573,
84
+ "step": 20000
85
+ },
86
+ {
87
+ "epoch": 0.6688750159255956,
88
+ "grad_norm": 0.5144098401069641,
89
+ "learning_rate": 0.00022891637991949928,
90
+ "loss": 1.3669,
91
+ "step": 21000
92
+ },
93
+ {
94
+ "epoch": 0.7644285896292521,
95
+ "grad_norm": 0.5176816582679749,
96
+ "learning_rate": 0.00020861918508708878,
97
+ "loss": 1.3462,
98
+ "step": 24000
99
+ },
100
+ {
101
+ "epoch": 0.7962797808638044,
102
+ "eval_loss": 1.3367421627044678,
103
+ "eval_runtime": 36.8485,
104
+ "eval_samples_per_second": 274.204,
105
+ "eval_steps_per_second": 8.576,
106
+ "step": 25000
107
+ },
108
+ {
109
+ "epoch": 0.8599821633329087,
110
+ "grad_norm": 0.5269707441329956,
111
+ "learning_rate": 0.00018696093604028994,
112
+ "loss": 1.3283,
113
+ "step": 27000
114
+ },
115
+ {
116
+ "epoch": 0.9555357370365651,
117
+ "grad_norm": 0.5370417237281799,
118
+ "learning_rate": 0.0001644445065583549,
119
+ "loss": 1.3094,
120
+ "step": 30000
121
+ },
122
+ {
123
+ "epoch": 0.9555357370365651,
124
+ "eval_loss": 1.3087339401245117,
125
+ "eval_runtime": 36.9277,
126
+ "eval_samples_per_second": 273.616,
127
+ "eval_steps_per_second": 8.557,
128
+ "step": 30000
129
+ },
130
+ {
131
+ "epoch": 1.0510893107402217,
132
+ "grad_norm": 0.5604170560836792,
133
+ "learning_rate": 0.00014159269615127048,
134
+ "loss": 1.2835,
135
+ "step": 33000
136
+ },
137
+ {
138
+ "epoch": 1.114791693209326,
139
+ "eval_loss": 1.283848762512207,
140
+ "eval_runtime": 36.9334,
141
+ "eval_samples_per_second": 273.573,
142
+ "eval_steps_per_second": 8.556,
143
+ "step": 35000
144
+ },
145
+ {
146
+ "epoch": 1.1466428844438783,
147
+ "grad_norm": 0.5592519044876099,
148
+ "learning_rate": 0.00011893609139772691,
149
+ "loss": 1.265,
150
+ "step": 36000
151
+ },
152
+ {
153
+ "epoch": 1.2421964581475347,
154
+ "grad_norm": 0.594070315361023,
155
+ "learning_rate": 9.700074647841136e-05,
156
+ "loss": 1.2534,
157
+ "step": 39000
158
+ },
159
+ {
160
+ "epoch": 1.274047649382087,
161
+ "eval_loss": 1.260542392730713,
162
+ "eval_runtime": 36.9536,
163
+ "eval_samples_per_second": 273.424,
164
+ "eval_steps_per_second": 8.551,
165
+ "step": 40000
166
+ },
167
+ {
168
+ "epoch": 1.3377500318511912,
169
+ "grad_norm": 0.6026796102523804,
170
+ "learning_rate": 7.629596894514561e-05,
171
+ "loss": 1.2422,
172
+ "step": 42000
173
+ },
174
+ {
175
+ "epoch": 1.4333036055548478,
176
+ "grad_norm": 0.5905404090881348,
177
+ "learning_rate": 5.7302494322958876e-05,
178
+ "loss": 1.2303,
179
+ "step": 45000
180
+ },
181
+ {
182
+ "epoch": 1.4333036055548478,
183
+ "eval_loss": 1.2406779527664185,
184
+ "eval_runtime": 37.226,
185
+ "eval_samples_per_second": 271.423,
186
+ "eval_steps_per_second": 8.489,
187
+ "step": 45000
188
+ },
189
+ {
190
+ "epoch": 1.5288571792585044,
191
+ "grad_norm": 0.6186478137969971,
192
+ "learning_rate": 4.046132411403847e-05,
193
+ "loss": 1.2187,
194
+ "step": 48000
195
+ },
196
+ {
197
+ "epoch": 1.5925595617276085,
198
+ "eval_loss": 1.2243584394454956,
199
+ "eval_runtime": 37.1965,
200
+ "eval_samples_per_second": 271.639,
201
+ "eval_steps_per_second": 8.495,
202
+ "step": 50000
203
+ }
204
+ ],
205
+ "logging_steps": 3000,
206
+ "max_steps": 62792,
207
+ "num_input_tokens_seen": 0,
208
+ "num_train_epochs": 2,
209
+ "save_steps": 500,
210
+ "stateful_callbacks": {
211
+ "TrainerControl": {
212
+ "args": {
213
+ "should_epoch_stop": false,
214
+ "should_evaluate": false,
215
+ "should_log": false,
216
+ "should_save": true,
217
+ "should_training_stop": false
218
+ },
219
+ "attributes": {}
220
+ }
221
+ },
222
+ "total_flos": 9.671309680155034e+16,
223
+ "train_batch_size": 32,
224
+ "trial_name": null,
225
+ "trial_params": null
226
+ }
checkpoint-50000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47dea390da80729e81a7320baa573ab0bba14035962ea3bf179f44ed8e91a0d9
3
+ size 5304
checkpoint-55000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 32,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 256,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 512,
14
+ "max_position_embeddings": 512,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 30,
19
+ "num_key_value_heads": 8,
20
+ "pad_token_id": 2,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-06,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": true,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.51.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
checkpoint-55000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.51.3"
7
+ }
checkpoint-55000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90bd10caf84bff659cebdee378fa4ec0baa3854af5ac028859034e0977760b97
3
+ size 111503488
checkpoint-55000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58810b8f3ab1618999d680aad83e4453f866f49df92c46bdbccb7dd22fb9b79e
3
+ size 223176442
checkpoint-55000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7682299c566684ea51cf26f0c86b6ffaa3c0bc63cbdf84674b29a2c62ac72143
3
+ size 14244
checkpoint-55000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c5ed0cd7a0667e74d3cccf18cf9c8e09dedf9bcbe1c5d32ff54dcd38e07805f
3
+ size 1064
checkpoint-55000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-55000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-55000/tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "1": {
7
+ "content": "<s>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "2": {
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ }
22
+ },
23
+ "bos_token": "<s>",
24
+ "clean_up_tokenization_spaces": false,
25
+ "eos_token": "</s>",
26
+ "extra_special_tokens": {},
27
+ "legacy": true,
28
+ "model_max_length": 1000000000000000019884624838656,
29
+ "pad_token": "</s>",
30
+ "sp_model_kwargs": {},
31
+ "spaces_between_special_tokens": false,
32
+ "tokenizer_class": "LlamaTokenizer",
33
+ "unk_token": "",
34
+ "use_default_system_prompt": false
35
+ }
checkpoint-55000/trainer_state.json ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": 1.2133299112319946,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.7518155179003694,
6
+ "eval_steps": 5000,
7
+ "global_step": 55000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09555357370365651,
14
+ "grad_norm": 0.6688508987426758,
15
+ "learning_rate": 0.0002992259870276421,
16
+ "loss": 3.0634,
17
+ "step": 3000
18
+ },
19
+ {
20
+ "epoch": 0.15925595617276087,
21
+ "eval_loss": 1.6379895210266113,
22
+ "eval_runtime": 36.9077,
23
+ "eval_samples_per_second": 273.764,
24
+ "eval_steps_per_second": 8.562,
25
+ "step": 5000
26
+ },
27
+ {
28
+ "epoch": 0.19110714740731302,
29
+ "grad_norm": 0.5710486173629761,
30
+ "learning_rate": 0.00029518136830057303,
31
+ "loss": 1.6711,
32
+ "step": 6000
33
+ },
34
+ {
35
+ "epoch": 0.2866607211109696,
36
+ "grad_norm": 0.5231711864471436,
37
+ "learning_rate": 0.00028776584441877383,
38
+ "loss": 1.5345,
39
+ "step": 9000
40
+ },
41
+ {
42
+ "epoch": 0.31851191234552173,
43
+ "eval_loss": 1.4841793775558472,
44
+ "eval_runtime": 36.8712,
45
+ "eval_samples_per_second": 274.035,
46
+ "eval_steps_per_second": 8.57,
47
+ "step": 10000
48
+ },
49
+ {
50
+ "epoch": 0.38221429481462604,
51
+ "grad_norm": 0.4970897436141968,
52
+ "learning_rate": 0.00027715159331368833,
53
+ "loss": 1.4714,
54
+ "step": 12000
55
+ },
56
+ {
57
+ "epoch": 0.47776786851828257,
58
+ "grad_norm": 0.5002285242080688,
59
+ "learning_rate": 0.0002635850628193778,
60
+ "loss": 1.4255,
61
+ "step": 15000
62
+ },
63
+ {
64
+ "epoch": 0.47776786851828257,
65
+ "eval_loss": 1.4150956869125366,
66
+ "eval_runtime": 36.8321,
67
+ "eval_samples_per_second": 274.326,
68
+ "eval_steps_per_second": 8.579,
69
+ "step": 15000
70
+ },
71
+ {
72
+ "epoch": 0.5733214422219391,
73
+ "grad_norm": 0.49311497807502747,
74
+ "learning_rate": 0.0002473812485038713,
75
+ "loss": 1.3929,
76
+ "step": 18000
77
+ },
78
+ {
79
+ "epoch": 0.6370238246910435,
80
+ "eval_loss": 1.3720492124557495,
81
+ "eval_runtime": 36.8585,
82
+ "eval_samples_per_second": 274.13,
83
+ "eval_steps_per_second": 8.573,
84
+ "step": 20000
85
+ },
86
+ {
87
+ "epoch": 0.6688750159255956,
88
+ "grad_norm": 0.5144098401069641,
89
+ "learning_rate": 0.00022891637991949928,
90
+ "loss": 1.3669,
91
+ "step": 21000
92
+ },
93
+ {
94
+ "epoch": 0.7644285896292521,
95
+ "grad_norm": 0.5176816582679749,
96
+ "learning_rate": 0.00020861918508708878,
97
+ "loss": 1.3462,
98
+ "step": 24000
99
+ },
100
+ {
101
+ "epoch": 0.7962797808638044,
102
+ "eval_loss": 1.3367421627044678,
103
+ "eval_runtime": 36.8485,
104
+ "eval_samples_per_second": 274.204,
105
+ "eval_steps_per_second": 8.576,
106
+ "step": 25000
107
+ },
108
+ {
109
+ "epoch": 0.8599821633329087,
110
+ "grad_norm": 0.5269707441329956,
111
+ "learning_rate": 0.00018696093604028994,
112
+ "loss": 1.3283,
113
+ "step": 27000
114
+ },
115
+ {
116
+ "epoch": 0.9555357370365651,
117
+ "grad_norm": 0.5370417237281799,
118
+ "learning_rate": 0.0001644445065583549,
119
+ "loss": 1.3094,
120
+ "step": 30000
121
+ },
122
+ {
123
+ "epoch": 0.9555357370365651,
124
+ "eval_loss": 1.3087339401245117,
125
+ "eval_runtime": 36.9277,
126
+ "eval_samples_per_second": 273.616,
127
+ "eval_steps_per_second": 8.557,
128
+ "step": 30000
129
+ },
130
+ {
131
+ "epoch": 1.0510893107402217,
132
+ "grad_norm": 0.5604170560836792,
133
+ "learning_rate": 0.00014159269615127048,
134
+ "loss": 1.2835,
135
+ "step": 33000
136
+ },
137
+ {
138
+ "epoch": 1.114791693209326,
139
+ "eval_loss": 1.283848762512207,
140
+ "eval_runtime": 36.9334,
141
+ "eval_samples_per_second": 273.573,
142
+ "eval_steps_per_second": 8.556,
143
+ "step": 35000
144
+ },
145
+ {
146
+ "epoch": 1.1466428844438783,
147
+ "grad_norm": 0.5592519044876099,
148
+ "learning_rate": 0.00011893609139772691,
149
+ "loss": 1.265,
150
+ "step": 36000
151
+ },
152
+ {
153
+ "epoch": 1.2421964581475347,
154
+ "grad_norm": 0.594070315361023,
155
+ "learning_rate": 9.700074647841136e-05,
156
+ "loss": 1.2534,
157
+ "step": 39000
158
+ },
159
+ {
160
+ "epoch": 1.274047649382087,
161
+ "eval_loss": 1.260542392730713,
162
+ "eval_runtime": 36.9536,
163
+ "eval_samples_per_second": 273.424,
164
+ "eval_steps_per_second": 8.551,
165
+ "step": 40000
166
+ },
167
+ {
168
+ "epoch": 1.3377500318511912,
169
+ "grad_norm": 0.6026796102523804,
170
+ "learning_rate": 7.629596894514561e-05,
171
+ "loss": 1.2422,
172
+ "step": 42000
173
+ },
174
+ {
175
+ "epoch": 1.4333036055548478,
176
+ "grad_norm": 0.5905404090881348,
177
+ "learning_rate": 5.7302494322958876e-05,
178
+ "loss": 1.2303,
179
+ "step": 45000
180
+ },
181
+ {
182
+ "epoch": 1.4333036055548478,
183
+ "eval_loss": 1.2406779527664185,
184
+ "eval_runtime": 37.226,
185
+ "eval_samples_per_second": 271.423,
186
+ "eval_steps_per_second": 8.489,
187
+ "step": 45000
188
+ },
189
+ {
190
+ "epoch": 1.5288571792585044,
191
+ "grad_norm": 0.6186478137969971,
192
+ "learning_rate": 4.046132411403847e-05,
193
+ "loss": 1.2187,
194
+ "step": 48000
195
+ },
196
+ {
197
+ "epoch": 1.5925595617276085,
198
+ "eval_loss": 1.2243584394454956,
199
+ "eval_runtime": 37.1965,
200
+ "eval_samples_per_second": 271.639,
201
+ "eval_steps_per_second": 8.495,
202
+ "step": 50000
203
+ },
204
+ {
205
+ "epoch": 1.6244107529621608,
206
+ "grad_norm": 0.6473325490951538,
207
+ "learning_rate": 2.6163486369252473e-05,
208
+ "loss": 1.2095,
209
+ "step": 51000
210
+ },
211
+ {
212
+ "epoch": 1.7199643266658173,
213
+ "grad_norm": 0.6508156061172485,
214
+ "learning_rate": 1.4740956572229152e-05,
215
+ "loss": 1.2001,
216
+ "step": 54000
217
+ },
218
+ {
219
+ "epoch": 1.7518155179003694,
220
+ "eval_loss": 1.2133299112319946,
221
+ "eval_runtime": 37.1949,
222
+ "eval_samples_per_second": 271.65,
223
+ "eval_steps_per_second": 8.496,
224
+ "step": 55000
225
+ }
226
+ ],
227
+ "logging_steps": 3000,
228
+ "max_steps": 62792,
229
+ "num_input_tokens_seen": 0,
230
+ "num_train_epochs": 2,
231
+ "save_steps": 500,
232
+ "stateful_callbacks": {
233
+ "TrainerControl": {
234
+ "args": {
235
+ "should_epoch_stop": false,
236
+ "should_evaluate": false,
237
+ "should_log": false,
238
+ "should_save": true,
239
+ "should_training_stop": false
240
+ },
241
+ "attributes": {}
242
+ }
243
+ },
244
+ "total_flos": 1.0638444879387034e+17,
245
+ "train_batch_size": 32,
246
+ "trial_name": null,
247
+ "trial_params": null
248
+ }
checkpoint-55000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47dea390da80729e81a7320baa573ab0bba14035962ea3bf179f44ed8e91a0d9
3
+ size 5304
checkpoint-60000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 32,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 256,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 512,
14
+ "max_position_embeddings": 512,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 30,
19
+ "num_key_value_heads": 8,
20
+ "pad_token_id": 2,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-06,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": true,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.51.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
checkpoint-60000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.51.3"
7
+ }
checkpoint-60000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39f921a0b25c62636ee1d1e72fc061257ac5bcc99ec5bd898868cd14c91f8727
3
+ size 111503488
checkpoint-60000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc1f9eb7a33e964435001f70d9707e54243644755a6ae28777b129bd5bf911ce
3
+ size 223176442
checkpoint-60000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d3b7102895eb0637b0cab516bd672f216b2bf79078a83eb301011a90444f44c
3
+ size 14244