thiomajid committed
Commit 7cdcfe4 · verified · 1 parent: f9c8d85

Model save

README.md ADDED
@@ -0,0 +1,65 @@
+ ---
+ library_name: transformers
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: SlimPajama_10k_SmolLM2-360M_distil_ratio_no_additive_norm
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # SlimPajama_10k_SmolLM2-360M_distil_ratio_no_additive_norm
+
+ This model is a fine-tuned version of [](https://huggingface.co/) on an unspecified dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 5.9171
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 5
+ - eval_batch_size: 5
+ - seed: 42
+ - optimizer: adamw_torch_fused with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 1
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 9.1473 | 0.125 | 250 | 7.2546 |
+ | 7.0097 | 0.25 | 500 | 6.7396 |
+ | 6.6398 | 0.375 | 750 | 6.4495 |
+ | 6.4032 | 0.5 | 1000 | 6.2413 |
+ | 6.204 | 0.625 | 1250 | 6.0940 |
+ | 6.07 | 0.75 | 1500 | 5.9828 |
+ | 5.9685 | 0.875 | 1750 | 5.9299 |
+ | 5.9672 | 1.0 | 2000 | 5.9171 |
+
+ ### Framework versions
+
+ - Transformers 4.48.3
+ - Pytorch 2.5.1+cu124
+ - Datasets 3.3.2
+ - Tokenizers 0.21.0
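For reference, the hyperparameters listed in the card translate roughly into the following `transformers.TrainingArguments`. This is a minimal sketch: `output_dir` and the exact AMP flag are assumptions, the remaining values mirror the card.

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="SlimPajama_10k_SmolLM2-360M_distil_ratio_no_additive_norm",  # assumed
    learning_rate=2e-4,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    seed=42,
    optim="adamw_torch_fused",
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=1,
    fp16=True,  # "Native AMP" per the card; bf16 is equally plausible
)
```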
config.json ADDED
@@ -0,0 +1,96 @@
+ {
+   "architectures": [
+     "DistilxLSTM"
+   ],
+   "model_type": "xlstm",
+   "pad_token_id": 2,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "xlstm_cfg": {
+     "_block_map": "1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0",
+     "add_embedding_dropout": false,
+     "add_post_blocks_norm": true,
+     "bias": false,
+     "context_length": 256,
+     "dropout": 0.0,
+     "embedding_dim": 960,
+     "mlstm_block": {
+       "_block_idx": null,
+       "_num_blocks": 16,
+       "mlstm": {
+         "_inner_embedding_dim": 1920,
+         "_num_blocks": 16,
+         "_proj_up_dim": 1920,
+         "bias": false,
+         "context_length": 256,
+         "conv1d_kernel_size": 4,
+         "dropout": 0.0,
+         "embedding_dim": 960,
+         "num_heads": 4,
+         "proj_factor": 2.0,
+         "qkv_proj_blocksize": 32,
+         "round_proj_up_dim_up": true,
+         "round_proj_up_to_multiple_of": 64
+       }
+     },
+     "num_blocks": 16,
+     "slstm_at": [
+       0,
+       4,
+       8,
+       12
+     ],
+     "slstm_block": {
+       "_block_idx": null,
+       "_num_blocks": 16,
+       "feedforward": {
+         "_num_blocks": 1,
+         "_proj_up_dim": 0,
+         "act_fn": "gelu",
+         "bias": false,
+         "dropout": 0.0,
+         "embedding_dim": -1,
+         "ff_type": "ffn_gated",
+         "proj_factor": 1.7,
+         "round_proj_up_dim_up": true,
+         "round_proj_up_to_multiple_of": 64
+       },
+       "slstm": {
+         "_block_idx": null,
+         "_num_blocks": 16,
+         "backend": "cuda",
+         "batch_size": 8,
+         "bias_init": "powerlaw_blockdependent",
+         "constants": {},
+         "conv1d_kernel_size": 4,
+         "dropout": 0.0,
+         "dtype": "bfloat16",
+         "dtype_a": "float32",
+         "dtype_b": "float32",
+         "dtype_g": "bfloat16",
+         "dtype_r": "bfloat16",
+         "dtype_s": "bfloat16",
+         "dtype_w": "bfloat16",
+         "embedding_dim": 960,
+         "enable_automatic_mixed_precision": true,
+         "forward_clipval": null,
+         "function": "slstm",
+         "gradient_recurrent_clipval": null,
+         "gradient_recurrent_cut": false,
+         "group_norm_weight": true,
+         "hidden_size": 960,
+         "initial_val": 0.0,
+         "input_shape": "BSGNH",
+         "internal_input_shape": "SBNGH",
+         "num_gates": 4,
+         "num_heads": 4,
+         "num_states": 4,
+         "output_shape": "BNSH",
+         "recurrent_weight_init": "zeros"
+       }
+     },
+     "tie_weights": false,
+     "vocab_size": 49152,
+     "weight_decay_on_embedding": false
+   }
+ }
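Note how `_block_map` encodes the 16-block layout: assuming `1` marks an sLSTM block and `0` an mLSTM block (consistent with `slstm_at` in the config above), the string can be re-derived directly.

```python
# Re-derive "_block_map" from "slstm_at" and "num_blocks", assuming
# 1 = sLSTM block and 0 = mLSTM block at each layer index.
num_blocks = 16
slstm_at = {0, 4, 8, 12}

block_map = ",".join("1" if i in slstm_at else "0" for i in range(num_blocks))
assert block_map == "1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0"
print(block_map)
```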
events.out.tfevents.1741447692.ef0fed4470ac.1412.0manual_logs ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:669d773e69f4432118654f8c0f2fda720cdc5d7e58b98dea3fb062901ccb3575
+ size 5119656
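The `events.out.tfevents.*` logs, `model.safetensors`, and `training_args.bin` below are stored via Git LFS, so the diff shows three-line pointer files (spec version, SHA-256 object id, byte size) rather than the binaries themselves. A minimal sketch of reading such a pointer; `parse_lfs_pointer` is a hypothetical helper, not part of any library:

```python
# Parse a Git LFS pointer file ("key value" lines) into a dict.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:669d773e69f4432118654f8c0f2fda720cdc5d7e58b98dea3fb062901ccb3575
size 5119656"""

info = parse_lfs_pointer(pointer)
print(info["oid"])   # sha256:669d...3575
print(info["size"])  # 5119656 (bytes)
```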
events.out.tfevents.1741447693.ef0fed4470ac.1412.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f8295c03b48a70a216c0c7bfb4eae4a1bc467227d4126af5b4d16a5187ff6c80
+ size 11930
events.out.tfevents.1741450037.ef0fed4470ac.1412.2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f7aa8fcbeb3920e9f92716963dc064bd9b2e0cd4286687fa4012df14abd4c61
+ size 193
merges.txt ADDED
The diff for this file is too large to render. See raw diff
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ebcffbeb034a8cdc7420fe4ad01b33205cd5d2f3bcec0b1204f3a9fc0c2c67a
+ size 761042760
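Per this pointer, the actual weights are about 761 MB and are fetched by LFS on clone or download. Since the LFS oid is the SHA-256 of the file content, a downloaded copy can be verified against it; a hedged sketch (local path and chunk size are assumptions):

```python
import hashlib

# Hash a downloaded model.safetensors in 1 MiB chunks and compare
# against the sha256 oid from the LFS pointer above.
def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

expected = "7ebcffbeb034a8cdc7420fe4ad01b33205cd5d2f3bcec0b1204f3a9fc0c2c67a"
assert sha256_of("model.safetensors") == expected, "hash mismatch"
```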
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": {
+     "content": "<|im_start|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
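A quick way to confirm these mappings is to load the tokenizer from a local checkout of the repo (the `"."` path is an assumption) and print the special tokens:

```python
from transformers import AutoTokenizer

# Assumes the current directory contains tokenizer.json, tokenizer_config.json,
# and special_tokens_map.json from this commit.
tok = AutoTokenizer.from_pretrained(".")

print(tok.bos_token)  # <|im_start|>
print(tok.eos_token)  # <|im_end|>
print(tok.pad_token)  # <|im_end|>
print(tok.unk_token)  # <|endoftext|>
```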
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1,155 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<repo_name>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<reponame>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<file_sep>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<filename>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "<gh_stars>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "<issue_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<issue_comment>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "10": {
+       "content": "<issue_closed>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "11": {
+       "content": "<jupyter_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "12": {
+       "content": "<jupyter_text>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13": {
+       "content": "<jupyter_code>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "14": {
+       "content": "<jupyter_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "15": {
+       "content": "<jupyter_script>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "16": {
+       "content": "<empty_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": "<|im_start|>",
+   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "extra_special_tokens": {},
+   "model_max_length": 8192,
+   "pad_token": "<|im_end|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>",
+   "vocab_size": 49152
+ }
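The `chat_template` above emits ChatML-style turns delimited by `<|im_start|>` / `<|im_end|>` and injects a default SmolLM system prompt when the conversation has none. A sketch of rendering it (the `"."` local path is an assumption):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # assumed local checkout of this repo

messages = [{"role": "user", "content": "Hello!"}]
text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# <|im_start|>system
# You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```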
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c11814ba2b844e5146f38162738d7be033331e33a0d4f4a11158374d806b9d4
+ size 6264
vocab.json ADDED
The diff for this file is too large to render. See raw diff