jukofyork committed on
Commit
e4a4015
·
verified ·
0 Parent(s):

Duplicate from jukofyork/DeepSeek-V3-0324-CODER-DRAFT-0.6B-v1.0

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model:
4
+ - Qwen/Qwen2.5-Coder-0.5B-Instruct
5
+ datasets:
6
+ - agentlans/common-crawl-sample
7
+ - bigcode/the-stack-smol-xl
8
+ tags:
9
+ - draft
10
+ - speculative-decoding
11
+ language:
12
+ - zho
13
+ - eng
14
+ - fra
15
+ - spa
16
+ - por
17
+ - deu
18
+ - ita
19
+ - rus
20
+ - jpn
21
+ - kor
22
+ - vie
23
+ - tha
24
+ - ara
25
+ ---
26
+
27
+ ![image.webp](https://cdn-uploads.huggingface.co/production/uploads/65995c45539c808e84c38bf1/KL97x9lVuhmIPXbbKgvyY.webp)
28
+
29
+ A `0.6B` parameter draft (speculative decoding) model for use with [deepseek-ai/DeepSeek-V3-0324](https://huggingface.co/deepseek-ai/DeepSeek-V3-0324) and [deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3).
30
+
31
+ **NOTES**:
32
+
33
+ - This version (unlike the previous [jukofyork/DeepSeek-V3-0324-DRAFT-0.5B-v1.0](https://huggingface.co/jukofyork/DeepSeek-V3-0324-DRAFT-0.5B-v1.0)), was trained using only the [agentlans/common-crawl-sample](https://huggingface.co/datasets/agentlans/common-crawl-sample) and [bigcode/the-stack-smol-xl](https://huggingface.co/datasets/bigcode/the-stack-smol-xl) datasets.
34
+ - This version (unlike the previous [jukofyork/DeepSeek-V3-0324-DRAFT-0.5B-v1.0](https://huggingface.co/jukofyork/DeepSeek-V3-0324-DRAFT-0.5B-v1.0)), doesn't trim the heads down from 14 to 12.
35
+
36
+ See [jukofyork/DeepSeek-V3-0324-CODER-DRAFT-0.6B-v1.0-GGUF](https://huggingface.co/jukofyork/DeepSeek-V3-0324-CODER-DRAFT-0.6B-v1.0-GGUF) for the models in GGUF format.
37
+
38
+ ---
39
+
40
+ # How the model was created
41
+
42
+ ## 1. The initial model was created from [Qwen/Qwen2.5-Coder-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct) using [transplant-vocab](https://github.com/jukofyork/transplant-vocab):
43
+
44
+ ```sh
45
+ python ./transplant_vocab.py \
46
+ Qwen2.5-Coder-0.5B-Instruct \
47
+ DeepSeek-V3-0324-BF16 \
48
+ DeepSeek-V3-0324-CODER-DRAFT-0.6B-UNTRAINED \
49
+ --override "<|▁pad▁|>" "<|endoftext|>" \
50
+ --override "<|fim▁hole|>" "<|fim_middle|>" \
51
+ --override "<|fim▁begin|>" "<|fim_prefix|>" \
52
+ --override "<|fim▁end|>" "<|fim_suffix|>" \
53
+ --override "<|User|>" "<|im_start|>user\\n" \
54
+ --override "<|Assistant|>" "<|im_start|>assistant\\n" \
55
+ --override "<|EOT|>" "<|endoftext|>" \
56
+ --override "<|tool▁calls▁begin|>" "<tool_call>" \
57
+ --override "<|tool▁call▁begin|>" "<tool_call>" \
58
+ --override "<|tool▁outputs▁begin|>" "<tool_call>" \
59
+ --override "<|tool▁output▁begin|>" "<tool_call>" \
60
+ --override "<|tool▁calls▁end|>" "</tool_call>" \
61
+ --override "<|tool▁call▁end|>" "</tool_call>" \
62
+ --override "<|tool▁outputs▁end|>" "</tool_call>" \
63
+ --override "<|tool▁output▁end|>" "</tool_call>" \
64
+ --override "<|tool▁sep|>" "</tool_call>"
65
+ ```
66
+
67
+ ## 2. The following datasets were merged to create a fine-tuning dataset of ~2.5B tokens:
68
+
69
+ - [agentlans/common-crawl-sample](https://huggingface.co/datasets/agentlans/common-crawl-sample)
70
+ - [bigcode/the-stack-smol-xl](https://huggingface.co/datasets/bigcode/the-stack-smol-xl)
71
+
72
+ formatted simply with `<|end▁of▁sentence|>` tags between documents.
73
+
74
+ ## 3. The model was then trained using [qlora-pipe](https://github.com/tdrussell/qlora-pipe) for 1 epoch with a batch size of 120 and a sequence length of 32k (~4M tokens per step):
75
+
76
+ ```toml
77
+ # Resume a prior run
78
+ resume_from_checkpoint = false
79
+
80
+ # Paths
81
+ model = 'DeepSeek-V3-0324-CODER-DRAFT-0.6B-UNTRAINED'
82
+ output_dir = 'DeepSeek-V3-0324-CODER-DRAFT-0.6B'
83
+
84
+ # Optimization configuration
85
+ full_fine_tune = true
86
+ epochs = 1
87
+ lr_scheduler = 'cosine'
88
+ warmup_steps = 100
89
+
90
+ # Performance settings
91
+ pipeline_stages = 1
92
+ logging_steps = 1
93
+ eval_steps = 100
94
+ save_steps = 100
95
+ checkpoint_every_n_minutes = 60
96
+ eval_before_first_step = true
97
+ eval_after_last_step = true
98
+ model_weight_dtype = 'bfloat16'
99
+ keep_states = 3
100
+ group_by_length = true
101
+ activation_checkpointing = 'unsloth'
102
+
103
+ # Dataset configuration
104
+ dataset_combination_mode = 'concatenate'
105
+ eval_gradient_accumulation_steps = 20
106
+
107
+ [optimizer]
108
+ type = 'adamw_kahan'
109
+ lr = 5e-5
110
+ beta1 = 0.9
111
+ beta2 = 0.999
112
+ weight_decay = 0.01
113
+
114
+ [[datasets]]
115
+ name = 'mixed_data'
116
+ dataset_type = 'textfile'
117
+ dataset_path = 'mixed_data/*.txt'
118
+ sequence_len = 32768
119
+ eval_size = 0.01
120
+ ```
121
+
122
+ ```json
123
+ {
124
+ "train_micro_batch_size_per_gpu": 1,
125
+ "gradient_accumulation_steps": 20,
126
+ "gradient_clipping": 1.0,
127
+ "steps_per_print": 1
128
+ }
129
+ ```
130
+
131
+ I used six `RTX A6000` GPUs over three nodes and hence the `120` batch size (`6 x 20 gradient accumulation steps = 120`).
config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "DeepSeek-V3-0324-CODER-DRAFT-0.6B",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 1,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 896,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4864,
13
+ "max_position_embeddings": 163840,
14
+ "max_window_layers": 21,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 14,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 2,
19
+ "pad_token_id": 1,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": {
22
+ "factor": 4.0,
23
+ "original_max_position_embeddings": 32768,
24
+ "type": "yarn"
25
+ },
26
+ "rope_theta": 1000000.0,
27
+ "sliding_window": null,
28
+ "tie_word_embeddings": false,
29
+ "torch_dtype": "bfloat16",
30
+ "transformers_version": "4.44.2",
31
+ "use_cache": true,
32
+ "use_sliding_window": false,
33
+ "vocab_size": 129280
34
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.44.2"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3827b3badd63324ecd8b2986b9f6739f6f0d40b23a1f62e282db65b2666787c8
3
+ size 1179168272
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff